@archon-claw/cli 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent.d.ts +2 -0
- package/dist/agent.js +152 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +141 -0
- package/dist/config.d.ts +2 -0
- package/dist/config.js +161 -0
- package/dist/eval/assertions.d.ts +9 -0
- package/dist/eval/assertions.js +137 -0
- package/dist/eval/execute.d.ts +13 -0
- package/dist/eval/execute.js +260 -0
- package/dist/eval/formatter.d.ts +10 -0
- package/dist/eval/formatter.js +62 -0
- package/dist/eval/judge.d.ts +7 -0
- package/dist/eval/judge.js +116 -0
- package/dist/eval/runner.d.ts +9 -0
- package/dist/eval/runner.js +156 -0
- package/dist/eval/types.d.ts +67 -0
- package/dist/eval/types.js +1 -0
- package/dist/llm.d.ts +7 -0
- package/dist/llm.js +52 -0
- package/dist/mcp-manager.d.ts +51 -0
- package/dist/mcp-manager.js +268 -0
- package/dist/pending-tool-results.d.ts +4 -0
- package/dist/pending-tool-results.js +39 -0
- package/dist/public/assets/chat-input-BBnVJs9h.js +151 -0
- package/dist/public/assets/chat-input-CISJdhF2.css +1 -0
- package/dist/public/assets/embed-DhIUBDdf.js +1 -0
- package/dist/public/assets/main-Bfvj6DnV.js +16 -0
- package/dist/public/embed/widget.js +233 -0
- package/dist/public/embed.html +14 -0
- package/dist/public/index.html +14 -0
- package/dist/scaffold.d.ts +2 -0
- package/dist/scaffold.js +82 -0
- package/dist/schemas.d.ts +899 -0
- package/dist/schemas.js +134 -0
- package/dist/server.d.ts +3 -0
- package/dist/server.js +258 -0
- package/dist/session.d.ts +8 -0
- package/dist/session.js +70 -0
- package/dist/templates/agent/model.json +6 -0
- package/dist/templates/agent/system-prompt.md +9 -0
- package/dist/templates/agent/tool-impls/greeting.impl.js +9 -0
- package/dist/templates/agent/tools/greeting.json +14 -0
- package/dist/templates/workspace/.claude/skills/create-agent/SKILL.md +90 -0
- package/dist/templates/workspace/.claude/skills/create-dataset/SKILL.md +57 -0
- package/dist/templates/workspace/.claude/skills/create-eval-case/SKILL.md +159 -0
- package/dist/templates/workspace/.claude/skills/create-eval-judge/SKILL.md +128 -0
- package/dist/templates/workspace/.claude/skills/create-mcp-config/SKILL.md +151 -0
- package/dist/templates/workspace/.claude/skills/create-model-config/SKILL.md +45 -0
- package/dist/templates/workspace/.claude/skills/create-skill/SKILL.md +63 -0
- package/dist/templates/workspace/.claude/skills/create-system-prompt/SKILL.md +168 -0
- package/dist/templates/workspace/.claude/skills/create-tool/SKILL.md +56 -0
- package/dist/templates/workspace/.claude/skills/create-tool-impl/SKILL.md +83 -0
- package/dist/templates/workspace/.claude/skills/create-tool-test/SKILL.md +117 -0
- package/dist/templates/workspace/.claude/skills/create-tool-ui/SKILL.md +218 -0
- package/dist/test-runner.d.ts +22 -0
- package/dist/test-runner.js +166 -0
- package/dist/types.d.ts +75 -0
- package/dist/types.js +1 -0
- package/dist/validator/index.d.ts +16 -0
- package/dist/validator/index.js +54 -0
- package/dist/validator/plugin.d.ts +21 -0
- package/dist/validator/plugin.js +1 -0
- package/dist/validator/plugins/agent-dir.d.ts +2 -0
- package/dist/validator/plugins/agent-dir.js +171 -0
- package/dist/validator/plugins/agent-skill.d.ts +2 -0
- package/dist/validator/plugins/agent-skill.js +31 -0
- package/dist/validator/plugins/dataset.d.ts +2 -0
- package/dist/validator/plugins/dataset.js +20 -0
- package/dist/validator/plugins/mcp.d.ts +2 -0
- package/dist/validator/plugins/mcp.js +20 -0
- package/dist/validator/plugins/model.d.ts +2 -0
- package/dist/validator/plugins/model.js +20 -0
- package/dist/validator/plugins/system-prompt.d.ts +2 -0
- package/dist/validator/plugins/system-prompt.js +25 -0
- package/dist/validator/plugins/tool.d.ts +2 -0
- package/dist/validator/plugins/tool.js +20 -0
- package/dist/validator/zod-utils.d.ts +3 -0
- package/dist/validator/zod-utils.js +7 -0
- package/package.json +41 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
import { createClient, toOpenAITools, streamChat } from "../llm.js";
import { runAssertions } from "./assertions.js";
// Upper bound on tool-call round-trips per generation, so a model that keeps
// requesting tools cannot loop forever (see runGeneration).
const MAX_ITERATIONS = 20;
|
|
4
|
+
/**
 * Execute a single eval case, dispatching on its conversation mode.
 * Unknown modes produce a failed result rather than throwing, so one bad
 * case cannot abort the whole run.
 */
export async function executeCase(opts) {
    const startedAt = Date.now();
    const { evalCase, fileName } = opts;
    if (evalCase.mode === "single") {
        return executeSingle(opts, startedAt);
    }
    if (evalCase.mode === "injected") {
        return executeInjected(opts, startedAt);
    }
    if (evalCase.mode === "sequential") {
        return executeSequential(opts, startedAt);
    }
    // Unknown mode: report a synthetic failing assertion so the case shows
    // up as failed in the report instead of crashing the runner.
    return {
        file: fileName,
        case: evalCase.name,
        mode: evalCase.mode,
        passed: false,
        duration: Date.now() - startedAt,
        response: "",
        toolCalls: [],
        assertionResults: [
            {
                type: "contains",
                value: "",
                passed: false,
                message: `unknown mode: ${evalCase.mode}`,
            },
        ],
    };
}
|
|
36
|
+
/**
 * Collect text and tool calls from one LLM generation cycle (may include
 * multiple tool-call loops).
 *
 * @param config     Loaded agent config (model, tools, toolImpls).
 * @param messages   Mutable OpenAI-style message array; assistant and tool
 *                   messages are appended in place.
 * @param toolSubset Optional list of tool names to expose for this case.
 * @returns {{ text: string, toolCalls: Array, messages: Array }}
 *
 * Fix: tool-call arguments produced by the model are not guaranteed to be
 * valid JSON; a bare JSON.parse here previously crashed the whole eval run.
 * Malformed (or empty) argument strings are now reported back to the model
 * as a tool error instead.
 */
async function runGeneration(config, messages, toolSubset) {
    const client = createClient(config.model);
    // Restrict the advertised tools when the case names a subset.
    let tools = config.tools;
    if (toolSubset) {
        tools = config.tools.filter((t) => toolSubset.includes(t.name));
    }
    const openAITools = toOpenAITools(tools);
    let fullText = "";
    const allToolCalls = [];
    // MAX_ITERATIONS bounds the loop so a model that keeps requesting tools
    // cannot spin forever.
    for (let i = 0; i < MAX_ITERATIONS; i++) {
        const stream = streamChat(client, config.model, messages, openAITools);
        let textContent = "";
        // Streaming deltas arrive fragmented; accumulate per tool-call index.
        const toolCallsMap = new Map();
        for await (const chunk of stream) {
            const delta = chunk.choices[0]?.delta;
            if (!delta)
                continue;
            if (delta.content) {
                textContent += delta.content;
            }
            if (delta.tool_calls) {
                for (const tc of delta.tool_calls) {
                    const existing = toolCallsMap.get(tc.index);
                    if (existing) {
                        if (tc.function?.arguments) {
                            existing.args += tc.function.arguments;
                        }
                    }
                    else {
                        toolCallsMap.set(tc.index, {
                            id: tc.id ?? "",
                            name: tc.function?.name ?? "",
                            args: tc.function?.arguments ?? "",
                        });
                    }
                }
            }
        }
        // Build the assistant message mirroring what the model produced.
        const assistantToolCalls = [...toolCallsMap.values()].map((tc) => ({
            id: tc.id,
            type: "function",
            function: { name: tc.name, arguments: tc.args },
        }));
        const assistantMessage = assistantToolCalls.length > 0
            ? { role: "assistant", content: textContent || null, tool_calls: assistantToolCalls }
            : { role: "assistant", content: textContent };
        messages.push(assistantMessage);
        fullText += textContent;
        // No tool calls — done
        if (assistantToolCalls.length === 0)
            break;
        // Execute tool calls and feed each result back as a tool message.
        for (const tc of assistantToolCalls) {
            let args = {};
            let parseError = null;
            try {
                // Empty argument strings are treated as "no arguments".
                args = tc.function.arguments ? JSON.parse(tc.function.arguments) : {};
            }
            catch (err) {
                parseError = err instanceof Error ? err.message : String(err);
            }
            let result;
            if (parseError !== null) {
                result = { error: `Invalid tool arguments: ${parseError}` };
            }
            else {
                const impl = config.toolImpls.get(tc.function.name);
                if (impl) {
                    try {
                        result = await impl(args);
                    }
                    catch (err) {
                        result = { error: err instanceof Error ? err.message : String(err) };
                    }
                }
                else {
                    result = { error: `Unknown tool: ${tc.function.name}` };
                }
            }
            allToolCalls.push({ name: tc.function.name, args, result });
            messages.push({
                role: "tool",
                tool_call_id: tc.id,
                content: JSON.stringify(result),
            });
        }
    }
    return { text: fullText, toolCalls: allToolCalls, messages };
}
|
|
119
|
+
// ---- Mode 1: Single ----
/**
 * Single-turn mode: system prompt + the case's first user turn, then one
 * generation (which may still loop over tool calls internally).
 */
async function executeSingle(opts, start) {
    const { config, evalCase, fileName, judgeRunner } = opts;
    const firstTurn = evalCase.turns[0];
    const messages = [
        { role: "system", content: config.systemPrompt },
        { role: "user", content: firstTurn.content },
    ];
    const generation = await runGeneration(config, messages, evalCase.tools);
    const assertionResults = runAssertions(evalCase.assertions ?? [], generation.text, generation.toolCalls);
    const judgeResult = judgeRunner
        ? await judgeRunner(firstTurn.content, evalCase.expectedOutput, generation.text, messages)
        : undefined;
    return {
        file: fileName,
        case: evalCase.name,
        mode: "single",
        passed: assertionResults.every((a) => a.passed),
        duration: Date.now() - start,
        response: generation.text,
        toolCalls: generation.toolCalls,
        assertionResults,
        judgeResult,
    };
}
|
|
146
|
+
// ---- Mode 2: Injected ----
/**
 * Injected mode: replay all turns except the last as fixed conversation
 * history (including any recorded tool calls and their results), then let
 * the model generate a response to the final user turn.
 *
 * Fix: the assistant message carrying `tool_calls` is now pushed BEFORE its
 * matching `role: "tool"` result messages. The previous order (tool results
 * first) violates the OpenAI chat API contract, which requires every tool
 * message to follow the assistant message that requested it.
 */
async function executeInjected(opts, start) {
    const { config, evalCase, fileName, judgeRunner } = opts;
    const messages = [
        { role: "system", content: config.systemPrompt },
    ];
    // Inject all turns except the last user turn as history
    for (const turn of evalCase.turns.slice(0, -1)) {
        if (turn.role === "user") {
            messages.push({ role: "user", content: turn.content });
        }
        else if (turn.role === "assistant") {
            const msg = { role: "assistant", content: turn.content };
            if (turn.toolCalls && turn.toolCalls.length > 0) {
                msg.tool_calls = turn.toolCalls.map((tc, idx) => ({
                    id: `injected_${idx}`,
                    type: "function",
                    function: { name: tc.name, arguments: JSON.stringify(tc.args) },
                }));
                // Assistant message first, then its tool results (API contract).
                messages.push(msg);
                turn.toolCalls.forEach((tc, idx) => {
                    messages.push({
                        role: "tool",
                        tool_call_id: `injected_${idx}`,
                        content: JSON.stringify(tc.result),
                    });
                });
            }
            else {
                messages.push(msg);
            }
        }
    }
    // Add the last user turn (triggers generation)
    const lastTurn = evalCase.turns[evalCase.turns.length - 1];
    messages.push({ role: "user", content: lastTurn.content });
    const { text, toolCalls } = await runGeneration(config, messages, evalCase.tools);
    const assertionResults = runAssertions(evalCase.assertions ?? [], text, toolCalls);
    const passed = assertionResults.every((a) => a.passed);
    let judgeResult;
    if (judgeRunner) {
        judgeResult = await judgeRunner(lastTurn.content, evalCase.expectedOutput, text, messages);
    }
    return {
        file: fileName,
        case: evalCase.name,
        mode: "injected",
        passed,
        duration: Date.now() - start,
        response: text,
        toolCalls,
        assertionResults,
        judgeResult,
    };
}
|
|
200
|
+
// ---- Mode 3: Sequential ----
/**
 * Sequential mode: feed each user turn to the model one at a time, letting
 * the conversation (`messages`) accumulate across turns. Each turn can carry
 * its own assertions and judge flag; case-level assertions and the case-level
 * judge run against the FINAL response only (but with all tool calls from
 * every turn, and the judge sees the full conversation).
 *
 * NOTE: statement order matters here — `messages` is mutated in place by
 * runGeneration, so each turn's generation sees all prior turns.
 */
async function executeSequential(opts, start) {
    const { config, evalCase, fileName, judgeRunner } = opts;
    const messages = [
        { role: "system", content: config.systemPrompt },
    ];
    const allToolCalls = [];
    const turnResults = [];
    let lastResponse = "";
    let allPassed = true;
    for (const turn of evalCase.turns) {
        // Non-user turns are ignored in sequential mode — the model itself
        // produces the assistant side of the conversation.
        if (turn.role !== "user")
            continue;
        messages.push({ role: "user", content: turn.content });
        const { text, toolCalls } = await runGeneration(config, messages, evalCase.tools);
        lastResponse = text;
        allToolCalls.push(...toolCalls);
        // Per-turn assertions
        let turnAssertionResults;
        if (turn.assertions && turn.assertions.length > 0) {
            turnAssertionResults = runAssertions(turn.assertions, text, toolCalls);
            if (turnAssertionResults.some((a) => !a.passed)) {
                allPassed = false;
            }
        }
        // Per-turn judge (only when the turn opts in via `turn.judge`)
        let turnJudgeResult;
        if (judgeRunner && turn.judge) {
            turnJudgeResult = await judgeRunner(turn.content, turn.expectedOutput, text, messages);
        }
        turnResults.push({
            role: "assistant",
            content: text,
            toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
            assertionResults: turnAssertionResults,
            judgeResult: turnJudgeResult,
        });
    }
    // Case-level assertions on the final response
    const caseAssertionResults = runAssertions(evalCase.assertions ?? [], lastResponse, allToolCalls);
    if (caseAssertionResults.some((a) => !a.passed)) {
        allPassed = false;
    }
    // Case-level judge
    let judgeResult;
    if (judgeRunner) {
        judgeResult = await judgeRunner(evalCase.turns.filter((t) => t.role === "user").map((t) => t.content).join("\n"), evalCase.expectedOutput, lastResponse, messages);
    }
    return {
        file: fileName,
        case: evalCase.name,
        mode: "sequential",
        passed: allPassed,
        duration: Date.now() - start,
        response: lastResponse,
        turnResults,
        toolCalls: allToolCalls,
        assertionResults: caseAssertionResults,
        judgeResult,
    };
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { CaseResult, EvalFile } from "./types.js";
/** An eval file paired with the file name it was loaded from. */
interface LoadedEvalFile {
    fileName: string;
    data: EvalFile;
}
/**
 * Format eval results for console output.
 */
export declare function formatEvalResults(evalFiles: LoadedEvalFile[], results: CaseResult[]): string;
export {};
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
 * Render eval results as a human-readable console report: one section per
 * eval file, one line per case (with assertion and judge detail), and a
 * pass/fail summary footer.
 */
export function formatEvalResults(evalFiles, results) {
    const out = [];
    for (const ef of evalFiles) {
        out.push(`\n eval: ${ef.data.name} (${ef.fileName})\n`);
        const fileResults = results.filter((r) => r.file === ef.fileName);
        for (const r of fileResults) {
            const icon = r.passed ? "\u2713" : "\u2717";
            const durationStr = `${r.duration}ms`;
            out.push(` ${icon} ${r.case} (${r.mode})${padRight(durationStr, 40, r.case.length + r.mode.length + 7)}`);
            // Assertion detail: shown for failed cases or whenever assertions ran.
            if (!r.passed || r.assertionResults.length > 0) {
                for (const a of r.assertionResults) {
                    const aIcon = a.passed ? "\u2713" : "\u2717";
                    const detail = a.message ? ` (${a.message})` : "";
                    out.push(` - ${a.type} "${a.value}"${padRight(aIcon + detail, 30, a.type.length + a.value.length + 5)}`);
                }
            }
            // Sequential mode: surface failing per-turn assertions.
            if (r.turnResults) {
                for (const tr of r.turnResults) {
                    if (!tr.assertionResults)
                        continue;
                    for (const a of tr.assertionResults) {
                        if (a.passed)
                            continue;
                        const detail = a.message ? ` (${a.message})` : "";
                        out.push(` - ${a.type} "${a.value}" \u2717${detail}`);
                    }
                }
            }
            // Judge scores, when a judge ran for this case.
            if (r.judgeResult) {
                const scores = Object.entries(r.judgeResult.scores)
                    .map(([key, s]) => `${key}=${s.score}`)
                    .join(" ");
                out.push(` - judge: ${scores} avg=${r.judgeResult.overallScore}`);
            }
        }
    }
    // Summary footer.
    const totalCases = results.length;
    const passedCount = results.filter((r) => r.passed).length;
    out.push("");
    out.push(` ${totalCases} cases, ${passedCount} passed, ${totalCases - passedCount} failed`);
    // Mean judge score across all judged cases, rounded to one decimal.
    const judgeScores = results
        .filter((r) => r.judgeResult)
        .map((r) => r.judgeResult.overallScore);
    if (judgeScores.length > 0) {
        const avg = Math.round((judgeScores.reduce((a, b) => a + b, 0) / judgeScores.length) * 10) / 10;
        out.push(` Average judge score: ${avg} / 10`);
    }
    return out.join("\n");
}
/**
 * Left-pad `suffix` with spaces (minimum gap of 2) so that it lands roughly
 * `targetGap` columns after content of length `contentLength`.
 */
function padRight(suffix, targetGap, contentLength) {
    const gap = Math.max(2, targetGap - contentLength);
    return suffix.padStart(suffix.length + gap, " ");
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { ChatMessage, ModelConfig } from "../types.js";
import type { JudgeConfig, JudgeResult } from "./types.js";
/**
 * Scores one (user input, expected output, actual response) triple, with the
 * full conversation available as additional context.
 */
export type JudgeRunnerFn = (userInput: string, expectedOutput: string | undefined, actualResponse: string, conversation: ChatMessage[]) => Promise<JudgeResult>;
/**
 * Create a judge runner function from a JudgeConfig. `fallbackModel` is used
 * when the judge config does not specify its own model.
 */
export declare function createJudgeRunner(judgeConfig: JudgeConfig, fallbackModel: ModelConfig): JudgeRunnerFn;
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import { Liquid } from "liquidjs";
import { createClient } from "../llm.js";
// Default LLM-as-judge prompt (Chinese). Rendered with LiquidJS; template
// variables: user_input, expected_output, actual_response, conversation,
// dimensions. It asks the judge model to score every dimension (numeric
// range or binary pass/fail) and to reply with strict JSON only.
// NOTE: this is a runtime string sent to the model — do not reformat it.
const DEFAULT_PROMPT_TEMPLATE = `请评估以下 AI 助手的回复质量。

用户输入:{{ user_input }}
{% if expected_output %}期望输出:{{ expected_output }}{% endif %}
实际回复:{{ actual_response }}

请对以下每个维度独立评估,并给出简短理由。
{% for dim in dimensions %}
{% if dim.type == "binary" %}- {{ dim.label }}({{ dim.key }}):通过 true / 不通过 false
{% else %}- {{ dim.label }}({{ dim.key }}):{{ dim.min }} - {{ dim.max }} 分
{% endif %}{% endfor %}

请严格以下面的 JSON 格式回复,不要输出其他内容:
{
{% for dim in dimensions %}{% if dim.type == "binary" %} "{{ dim.key }}": { "score": <true 或 false>, "reason": "<理由>" }{% else %} "{{ dim.key }}": { "score": <分数>, "reason": "<理由>" }{% endif %}{% unless forloop.last %},{% endunless %}
{% endfor %}}`;
|
|
19
|
+
/**
 * Build a judge runner bound to a JudgeConfig. The returned async function
 * renders the judge prompt via LiquidJS, asks the judge model for
 * per-dimension scores, and parses the JSON reply into a JudgeResult.
 */
export function createJudgeRunner(judgeConfig, fallbackModel) {
    const model = judgeConfig.model ?? fallbackModel;
    const client = createClient(model);
    const engine = new Liquid();
    const template = judgeConfig.promptTemplate ?? DEFAULT_PROMPT_TEMPLATE;
    return async (userInput, expectedOutput, actualResponse, conversation) => {
        // Fill in per-dimension defaults so the template can rely on them.
        const dimensions = judgeConfig.dimensions.map((dim) => ({
            ...dim,
            type: dim.type ?? "numeric",
            min: dim.min ?? 0,
            max: dim.max ?? 10,
        }));
        const prompt = await engine.parseAndRender(template, {
            user_input: userInput,
            expected_output: expectedOutput ?? "",
            actual_response: actualResponse,
            conversation: conversation.map((m) => `${m.role}: ${m.content}`).join("\n"),
            dimensions,
        });
        // Low temperature keeps judging as deterministic as the model allows.
        const completion = await client.chat.completions.create({
            model: model.model,
            messages: [{ role: "user", content: prompt }],
            max_tokens: model.maxTokens ?? 2048,
            temperature: 0.1,
        });
        return parseJudgeResponse(completion.choices[0]?.message?.content ?? "", judgeConfig);
    };
}
|
|
50
|
+
/**
 * Parse the judge model's reply into per-dimension scores plus a weighted
 * overall score normalized onto a 0-10 scale. A reply with no parseable
 * JSON yields zero scores with an explanatory reason.
 */
function parseJudgeResponse(content, config) {
    const scores = {};
    try {
        // The reply may wrap the JSON in markdown fences; grab the outermost
        // brace-delimited span.
        const jsonMatch = content.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
            throw new Error("No JSON found in judge response");
        }
        const parsed = JSON.parse(jsonMatch[0]);
        for (const dim of config.dimensions) {
            scores[dim.key] = entryToScore(parsed[dim.key], dim);
        }
    }
    catch {
        for (const dim of config.dimensions) {
            scores[dim.key] = { score: 0, reason: "Failed to parse judge response" };
        }
    }
    // Weighted overall score: each dimension normalized to 0-10, weighted
    // by dim.weight, averaged over the total weight.
    let totalWeight = 0;
    let weightedSum = 0;
    for (const dim of config.dimensions) {
        const s = scores[dim.key];
        if (!s)
            continue;
        if ((dim.type ?? "numeric") === "binary") {
            // Binary scores are stored as 0/1; scale onto the 0-10 range.
            weightedSum += s.score * 10 * dim.weight;
        }
        else {
            const max = dim.max ?? 10;
            const min = dim.min ?? 0;
            const normalized = max !== min ? ((s.score - min) / (max - min)) * 10 : 0;
            weightedSum += normalized * dim.weight;
        }
        totalWeight += dim.weight;
    }
    const overallScore = totalWeight > 0 ? Math.round((weightedSum / totalWeight) * 10) / 10 : 0;
    return { scores, overallScore };
}
/** Convert one parsed JSON entry into a JudgeScore for dimension `dim`. */
function entryToScore(entry, dim) {
    if (!entry) {
        return { score: 0, reason: "Failed to parse score" };
    }
    if ((dim.type ?? "numeric") === "binary") {
        // Accept booleans or any truthy value; store as 0/1.
        return { score: entry.score ? 1 : 0, reason: entry.reason ?? "" };
    }
    if (typeof entry.score === "number") {
        return { score: entry.score, reason: entry.reason ?? "" };
    }
    return { score: 0, reason: "Failed to parse score" };
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { EvalSummary, EvalDetails, EvalOptions } from "./types.js";
/**
 * Run all eval files for an agent directory.
 *
 * Loads the agent config and every *.eval.json under eval-cases/, optionally
 * filters by file/tag, executes each case (with optional LLM-as-judge
 * scoring), and returns the aggregate summary, per-case details, and a
 * preformatted console report.
 */
export declare function runEvals(agentDir: string, options?: EvalOptions): Promise<{
    summary: EvalSummary;
    details: EvalDetails;
    formatted: string;
}>;
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { loadAgentConfig } from "../config.js";
|
|
4
|
+
import { executeCase } from "./execute.js";
|
|
5
|
+
import { createJudgeRunner } from "./judge.js";
|
|
6
|
+
import { formatEvalResults } from "./formatter.js";
|
|
7
|
+
/**
 * Run all eval files for an agent directory.
 *
 * Pipeline: load agent config → load eval-cases/*.eval.json → apply
 * file/tag filters → build judge runners from eval-judges/ (unless
 * options.noJudge) → execute every case sequentially → summarize.
 *
 * @param agentDir Path to the agent directory (resolved to absolute).
 * @param options  EvalOptions: { file?, tag?, save?, noJudge? }.
 * @returns { summary, details, formatted } — aggregate summary, per-case
 *          results, and a preformatted console report.
 * @throws when the requested file does not exist or no cases remain after
 *         filtering.
 */
export async function runEvals(agentDir, options = {}) {
    const absDir = path.resolve(agentDir);
    const casesDir = path.join(absDir, "eval-cases");
    const judgesDir = path.join(absDir, "eval-judges");
    // Load agent config
    const config = await loadAgentConfig(agentDir);
    // Load eval files from eval-cases/
    let evalFiles = await loadEvalFiles(casesDir);
    // Filter by file
    if (options.file) {
        evalFiles = evalFiles.filter((f) => f.fileName === options.file);
        if (evalFiles.length === 0) {
            throw new Error(`Eval file not found: ${options.file}`);
        }
    }
    // Filter cases by tag
    // NOTE: this mutates ef.data.cases in place on the loaded eval files.
    if (options.tag) {
        for (const ef of evalFiles) {
            ef.data.cases = ef.data.cases.filter((c) => c.tags && c.tags.includes(options.tag));
        }
        evalFiles = evalFiles.filter((ef) => ef.data.cases.length > 0);
    }
    if (evalFiles.length === 0) {
        throw new Error("No eval cases found");
    }
    // Load judge configs from eval-judges/ (optional)
    let judgeRunners;
    if (!options.noJudge) {
        const judgeConfigs = await loadJudgeConfigs(judgesDir);
        if (judgeConfigs.size > 0) {
            judgeRunners = new Map();
            for (const [name, judgeConfig] of judgeConfigs) {
                judgeRunners.set(name, createJudgeRunner(judgeConfig, config.model));
            }
        }
    }
    // Run all cases sequentially (one LLM conversation at a time)
    const results = [];
    const startTime = Date.now();
    for (const ef of evalFiles) {
        for (const evalCase of ef.data.cases) {
            // Resolve judge runner for this case; cases without an explicit
            // judge name fall back to the "default" judge, if present.
            let judgeRunner;
            if (judgeRunners) {
                const judgeName = evalCase.judge ?? "default";
                judgeRunner = judgeRunners.get(judgeName);
            }
            const result = await executeCase({
                config,
                evalCase,
                fileName: ef.fileName,
                judgeRunner,
            });
            results.push(result);
        }
    }
    const totalDuration = Date.now() - startTime;
    // Build summary
    const summary = buildSummary(results, absDir, config, totalDuration, evalFiles);
    const details = { results };
    // Save results if requested
    if (options.save) {
        await saveResults(absDir, summary, details);
    }
    const formatted = formatEvalResults(evalFiles, results);
    return { summary, details, formatted };
}
|
|
77
|
+
/**
 * Load and parse every `*.eval.json` file in `casesDir`.
 *
 * @param {string} casesDir - Path to the agent's eval-cases/ directory.
 * @returns {Promise<Array<{fileName: string, data: object}>>}
 * @throws if the directory is missing, contains no eval files, or a file is
 *         not valid JSON — the error now names the offending file, which the
 *         previous bare JSON.parse error did not.
 */
async function loadEvalFiles(casesDir) {
    let files;
    try {
        files = await fs.readdir(casesDir);
    }
    catch {
        throw new Error(`No eval-cases/ directory found`);
    }
    const evalFiles = files.filter((f) => f.endsWith(".eval.json"));
    if (evalFiles.length === 0) {
        throw new Error("No .eval.json files found in eval-cases/");
    }
    const loaded = [];
    for (const file of evalFiles) {
        const content = await fs.readFile(path.join(casesDir, file), "utf-8");
        let parsed;
        try {
            parsed = JSON.parse(content);
        }
        catch (err) {
            // Point the user at the broken file instead of a bare parse error.
            throw new Error(`Invalid JSON in eval file ${file}: ${err instanceof Error ? err.message : String(err)}`, { cause: err });
        }
        loaded.push({ fileName: file, data: parsed });
    }
    return loaded;
}
|
|
97
|
+
/**
 * Load judge configs from `judgesDir`, keyed by file name without the
 * `.json` extension. A missing directory yields an empty map, since the
 * eval-judges/ directory is optional.
 */
async function loadJudgeConfigs(judgesDir) {
    const configs = new Map();
    let entries;
    try {
        entries = await fs.readdir(judgesDir);
    }
    catch {
        return configs; // eval-judges/ is optional
    }
    for (const file of entries.filter((f) => f.endsWith(".json"))) {
        const raw = await fs.readFile(path.join(judgesDir, file), "utf-8");
        configs.set(file.replace(/\.json$/, ""), JSON.parse(raw));
    }
    return configs;
}
|
|
115
|
+
/**
 * Aggregate per-case results into an EvalSummary: overall pass/fail counts,
 * the mean judge score (null when no judge ran), and per-file tallies.
 */
function buildSummary(results, absDir, config, totalDuration, evalFiles) {
    const passedCount = results.filter((r) => r.passed).length;
    // Mean judge score across judged cases, rounded to one decimal.
    const judgeScores = results
        .filter((r) => r.judgeResult)
        .map((r) => r.judgeResult.overallScore);
    const averageScore = judgeScores.length === 0
        ? null
        : Math.round((judgeScores.reduce((sum, score) => sum + score, 0) / judgeScores.length) * 10) / 10;
    // Per-file pass/fail tallies.
    const files = evalFiles.map((ef) => {
        const perFile = results.filter((r) => r.file === ef.fileName);
        const perFilePassed = perFile.filter((r) => r.passed).length;
        return {
            file: ef.fileName,
            name: ef.data.name,
            cases: perFile.length,
            passed: perFilePassed,
            failed: perFile.length - perFilePassed,
        };
    });
    return {
        timestamp: new Date().toISOString(),
        agent: path.basename(absDir),
        model: `${config.model.provider}/${config.model.model}`,
        totalCases: results.length,
        passed: passedCount,
        failed: results.length - passedCount,
        averageScore,
        duration: totalDuration,
        files,
    };
}
|
|
149
|
+
/**
 * Persist summary.json and details.json under eval-results/<timestamp>/
 * inside the agent directory; returns the directory that was written.
 */
async function saveResults(absDir, summary, details) {
    // e.g. "2024-01-02T03-04-05" — colons and dots are not filesystem-safe.
    const stamp = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19);
    const resultsDir = path.join(absDir, "eval-results", stamp);
    await fs.mkdir(resultsDir, { recursive: true });
    await Promise.all([
        fs.writeFile(path.join(resultsDir, "summary.json"), JSON.stringify(summary, null, 2)),
        fs.writeFile(path.join(resultsDir, "details.json"), JSON.stringify(details, null, 2)),
    ]);
    return resultsDir;
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
export type { AssertionType, Assertion, EvalTurnToolCall, EvalTurn, EvalMode, EvalCase, EvalFile, JudgeDimension, JudgeConfig, } from "../schemas.js";
import type { AssertionType, EvalMode } from "../schemas.js";
/** Outcome of a single assertion checked against a response/tool calls. */
export interface AssertionResult {
    type: AssertionType;
    value: string;
    passed: boolean;
    /** Human-readable explanation, typically populated on failure. */
    message?: string;
}
/** One dimension's score and rationale from the LLM judge. */
export interface JudgeScore {
    score: number;
    reason: string;
}
/** Aggregated judge output: per-dimension scores plus a weighted average. */
export interface JudgeResult {
    scores: Record<string, JudgeScore>;
    /** Weighted average normalized to a 0-10 scale (see judge.js). */
    overallScore: number;
}
/** Record of one tool invocation made during generation. */
export interface ToolCallRecord {
    name: string;
    args: Record<string, unknown>;
    result: unknown;
}
/** Per-turn outcome, produced for sequential-mode cases. */
export interface TurnResult {
    role: "user" | "assistant";
    content: string;
    toolCalls?: ToolCallRecord[];
    assertionResults?: AssertionResult[];
    judgeResult?: JudgeResult;
}
/** Full outcome of one executed eval case. */
export interface CaseResult {
    /** Name of the eval file this case came from. */
    file: string;
    /** Case name. */
    case: string;
    mode: EvalMode;
    passed: boolean;
    /** Wall-clock duration in milliseconds. */
    duration: number;
    /** Final assistant response text. */
    response: string;
    /** Present only for sequential mode. */
    turnResults?: TurnResult[];
    toolCalls: ToolCallRecord[];
    assertionResults: AssertionResult[];
    judgeResult?: JudgeResult;
}
/** Pass/fail tally for a single eval file. */
export interface FileResult {
    file: string;
    name: string;
    cases: number;
    passed: number;
    failed: number;
}
/** Top-level run summary (persisted as summary.json). */
export interface EvalSummary {
    timestamp: string;
    agent: string;
    /** "provider/model" identifier of the agent model. */
    model: string;
    totalCases: number;
    passed: number;
    failed: number;
    /** Mean judge score across judged cases, or null when no judge ran. */
    averageScore: number | null;
    duration: number;
    files: FileResult[];
}
/** Full per-case results (persisted as details.json). */
export interface EvalDetails {
    results: CaseResult[];
}
/** Options accepted by runEvals. */
export interface EvalOptions {
    /** Run only this eval file (exact file name). */
    file?: string;
    /** Run only cases carrying this tag. */
    tag?: string;
    /** Persist results under eval-results/<timestamp>/. */
    save?: boolean;
    /** Skip LLM-as-judge scoring entirely. */
    noJudge?: boolean;
}
|