@archon-claw/cli 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/dist/agent.d.ts +2 -0
  2. package/dist/agent.js +152 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +141 -0
  5. package/dist/config.d.ts +2 -0
  6. package/dist/config.js +161 -0
  7. package/dist/eval/assertions.d.ts +9 -0
  8. package/dist/eval/assertions.js +137 -0
  9. package/dist/eval/execute.d.ts +13 -0
  10. package/dist/eval/execute.js +260 -0
  11. package/dist/eval/formatter.d.ts +10 -0
  12. package/dist/eval/formatter.js +62 -0
  13. package/dist/eval/judge.d.ts +7 -0
  14. package/dist/eval/judge.js +116 -0
  15. package/dist/eval/runner.d.ts +9 -0
  16. package/dist/eval/runner.js +156 -0
  17. package/dist/eval/types.d.ts +67 -0
  18. package/dist/eval/types.js +1 -0
  19. package/dist/llm.d.ts +7 -0
  20. package/dist/llm.js +52 -0
  21. package/dist/mcp-manager.d.ts +51 -0
  22. package/dist/mcp-manager.js +268 -0
  23. package/dist/pending-tool-results.d.ts +4 -0
  24. package/dist/pending-tool-results.js +39 -0
  25. package/dist/public/assets/chat-input-BBnVJs9h.js +151 -0
  26. package/dist/public/assets/chat-input-CISJdhF2.css +1 -0
  27. package/dist/public/assets/embed-DhIUBDdf.js +1 -0
  28. package/dist/public/assets/main-Bfvj6DnV.js +16 -0
  29. package/dist/public/embed/widget.js +233 -0
  30. package/dist/public/embed.html +14 -0
  31. package/dist/public/index.html +14 -0
  32. package/dist/scaffold.d.ts +2 -0
  33. package/dist/scaffold.js +82 -0
  34. package/dist/schemas.d.ts +899 -0
  35. package/dist/schemas.js +134 -0
  36. package/dist/server.d.ts +3 -0
  37. package/dist/server.js +258 -0
  38. package/dist/session.d.ts +8 -0
  39. package/dist/session.js +70 -0
  40. package/dist/templates/agent/model.json +6 -0
  41. package/dist/templates/agent/system-prompt.md +9 -0
  42. package/dist/templates/agent/tool-impls/greeting.impl.js +9 -0
  43. package/dist/templates/agent/tools/greeting.json +14 -0
  44. package/dist/templates/workspace/.claude/skills/create-agent/SKILL.md +90 -0
  45. package/dist/templates/workspace/.claude/skills/create-dataset/SKILL.md +57 -0
  46. package/dist/templates/workspace/.claude/skills/create-eval-case/SKILL.md +159 -0
  47. package/dist/templates/workspace/.claude/skills/create-eval-judge/SKILL.md +128 -0
  48. package/dist/templates/workspace/.claude/skills/create-mcp-config/SKILL.md +151 -0
  49. package/dist/templates/workspace/.claude/skills/create-model-config/SKILL.md +45 -0
  50. package/dist/templates/workspace/.claude/skills/create-skill/SKILL.md +63 -0
  51. package/dist/templates/workspace/.claude/skills/create-system-prompt/SKILL.md +168 -0
  52. package/dist/templates/workspace/.claude/skills/create-tool/SKILL.md +56 -0
  53. package/dist/templates/workspace/.claude/skills/create-tool-impl/SKILL.md +83 -0
  54. package/dist/templates/workspace/.claude/skills/create-tool-test/SKILL.md +117 -0
  55. package/dist/templates/workspace/.claude/skills/create-tool-ui/SKILL.md +218 -0
  56. package/dist/test-runner.d.ts +22 -0
  57. package/dist/test-runner.js +166 -0
  58. package/dist/types.d.ts +75 -0
  59. package/dist/types.js +1 -0
  60. package/dist/validator/index.d.ts +16 -0
  61. package/dist/validator/index.js +54 -0
  62. package/dist/validator/plugin.d.ts +21 -0
  63. package/dist/validator/plugin.js +1 -0
  64. package/dist/validator/plugins/agent-dir.d.ts +2 -0
  65. package/dist/validator/plugins/agent-dir.js +171 -0
  66. package/dist/validator/plugins/agent-skill.d.ts +2 -0
  67. package/dist/validator/plugins/agent-skill.js +31 -0
  68. package/dist/validator/plugins/dataset.d.ts +2 -0
  69. package/dist/validator/plugins/dataset.js +20 -0
  70. package/dist/validator/plugins/mcp.d.ts +2 -0
  71. package/dist/validator/plugins/mcp.js +20 -0
  72. package/dist/validator/plugins/model.d.ts +2 -0
  73. package/dist/validator/plugins/model.js +20 -0
  74. package/dist/validator/plugins/system-prompt.d.ts +2 -0
  75. package/dist/validator/plugins/system-prompt.js +25 -0
  76. package/dist/validator/plugins/tool.d.ts +2 -0
  77. package/dist/validator/plugins/tool.js +20 -0
  78. package/dist/validator/zod-utils.d.ts +3 -0
  79. package/dist/validator/zod-utils.js +7 -0
  80. package/package.json +41 -0
@@ -0,0 +1,260 @@
1
+ import { createClient, toOpenAITools, streamChat } from "../llm.js";
2
+ import { runAssertions } from "./assertions.js";
3
+ const MAX_ITERATIONS = 20;
4
+ /**
5
+ * Execute a single eval case, dispatching to the appropriate mode.
6
+ */
7
+ export async function executeCase(opts) {
8
+ const start = Date.now();
9
+ switch (opts.evalCase.mode) {
10
+ case "single":
11
+ return executeSingle(opts, start);
12
+ case "injected":
13
+ return executeInjected(opts, start);
14
+ case "sequential":
15
+ return executeSequential(opts, start);
16
+ default:
17
+ return {
18
+ file: opts.fileName,
19
+ case: opts.evalCase.name,
20
+ mode: opts.evalCase.mode,
21
+ passed: false,
22
+ duration: Date.now() - start,
23
+ response: "",
24
+ toolCalls: [],
25
+ assertionResults: [
26
+ {
27
+ type: "contains",
28
+ value: "",
29
+ passed: false,
30
+ message: `unknown mode: ${opts.evalCase.mode}`,
31
+ },
32
+ ],
33
+ };
34
+ }
35
+ }
36
/**
 * Collect text and tool calls from one LLM generation cycle (may include
 * multiple tool-call loops).
 *
 * Streams assistant output, accumulates tool-call deltas, executes requested
 * tools via config.toolImpls, feeds results back as "tool" messages, and
 * repeats until the model stops calling tools or MAX_ITERATIONS is reached.
 * Mutates `messages` in place so callers see the full conversation.
 */
async function runGeneration(config, messages, toolSubset) {
    const client = createClient(config.model);
    // Filter tools if a subset is specified
    let tools = config.tools;
    if (toolSubset) {
        tools = config.tools.filter((t) => toolSubset.includes(t.name));
    }
    const openAITools = toOpenAITools(tools);
    let fullText = "";
    const allToolCalls = [];
    for (let i = 0; i < MAX_ITERATIONS; i++) {
        const stream = streamChat(client, config.model, messages, openAITools);
        let textContent = "";
        // Keyed by the provider's tool-call index; argument JSON arrives as
        // incremental string fragments that must be concatenated.
        const toolCallsMap = new Map();
        for await (const chunk of stream) {
            const delta = chunk.choices[0]?.delta;
            if (!delta)
                continue;
            if (delta.content) {
                textContent += delta.content;
            }
            if (delta.tool_calls) {
                for (const tc of delta.tool_calls) {
                    const existing = toolCallsMap.get(tc.index);
                    if (existing) {
                        if (tc.function?.arguments) {
                            existing.args += tc.function.arguments;
                        }
                    }
                    else {
                        toolCallsMap.set(tc.index, {
                            id: tc.id ?? "",
                            name: tc.function?.name ?? "",
                            args: tc.function?.arguments ?? "",
                        });
                    }
                }
            }
        }
        // Build assistant message
        const assistantToolCalls = [...toolCallsMap.values()].map((tc) => ({
            id: tc.id,
            type: "function",
            function: { name: tc.name, arguments: tc.args },
        }));
        const assistantMessage = assistantToolCalls.length > 0
            ? { role: "assistant", content: textContent || null, tool_calls: assistantToolCalls }
            : { role: "assistant", content: textContent };
        messages.push(assistantMessage);
        fullText += textContent;
        // No tool calls — done
        if (assistantToolCalls.length === 0)
            break;
        // Execute tool calls
        for (const tc of assistantToolCalls) {
            let args;
            let result;
            try {
                // Model-produced JSON can be malformed; a parse failure must
                // not abort the whole eval run — report it as a tool error.
                args = JSON.parse(tc.function.arguments);
            }
            catch (err) {
                args = {};
                result = { error: `Invalid tool arguments: ${err instanceof Error ? err.message : String(err)}` };
            }
            if (result === undefined) {
                const impl = config.toolImpls.get(tc.function.name);
                if (impl) {
                    try {
                        result = await impl(args);
                    }
                    catch (err) {
                        result = { error: err instanceof Error ? err.message : String(err) };
                    }
                }
                else {
                    result = { error: `Unknown tool: ${tc.function.name}` };
                }
            }
            allToolCalls.push({ name: tc.function.name, args, result });
            messages.push({
                role: "tool",
                tool_call_id: tc.id,
                content: JSON.stringify(result),
            });
        }
    }
    return { text: fullText, toolCalls: allToolCalls, messages };
}
119
// ---- Mode 1: Single ----
/**
 * Run a one-shot case: system prompt plus a single user turn, then apply the
 * case assertions to the reply and optionally score it with the judge.
 */
async function executeSingle(opts, start) {
    const { config, evalCase, fileName, judgeRunner } = opts;
    const [userTurn] = evalCase.turns;
    const messages = [
        { role: "system", content: config.systemPrompt },
        { role: "user", content: userTurn.content },
    ];
    const generation = await runGeneration(config, messages, evalCase.tools);
    const assertionResults = runAssertions(evalCase.assertions ?? [], generation.text, generation.toolCalls);
    const judgeResult = judgeRunner
        ? await judgeRunner(userTurn.content, evalCase.expectedOutput, generation.text, messages)
        : undefined;
    return {
        file: fileName,
        case: evalCase.name,
        mode: "single",
        passed: assertionResults.every((a) => a.passed),
        duration: Date.now() - start,
        response: generation.text,
        toolCalls: generation.toolCalls,
        assertionResults,
        judgeResult,
    };
}
146
// ---- Mode 2: Injected ----
/**
 * Run a case with a pre-seeded conversation: every turn except the last is
 * injected verbatim as history (including simulated tool calls and their
 * results), and the final user turn triggers a real generation.
 *
 * Fixes vs. previous version: the assistant message carrying tool_calls is
 * now pushed BEFORE its tool-result messages (the chat API requires tool
 * messages to follow the assistant message they answer), and injected
 * tool_call_ids use a counter that is unique across ALL assistant turns
 * (per-turn indices collided when history had multiple tool-calling turns).
 */
async function executeInjected(opts, start) {
    const { config, evalCase, fileName, judgeRunner } = opts;
    const messages = [
        { role: "system", content: config.systemPrompt },
    ];
    // Monotonic counter so injected ids never repeat across turns.
    let injectedId = 0;
    // Inject all turns except the last user turn as history
    for (const turn of evalCase.turns.slice(0, -1)) {
        if (turn.role === "user") {
            messages.push({ role: "user", content: turn.content });
        }
        else if (turn.role === "assistant") {
            const msg = { role: "assistant", content: turn.content };
            if (turn.toolCalls && turn.toolCalls.length > 0) {
                const ids = turn.toolCalls.map(() => `injected_${injectedId++}`);
                msg.tool_calls = turn.toolCalls.map((tc, idx) => ({
                    id: ids[idx],
                    type: "function",
                    function: { name: tc.name, arguments: JSON.stringify(tc.args) },
                }));
                // Assistant message first, then its tool results in order.
                messages.push(msg);
                for (let idx = 0; idx < turn.toolCalls.length; idx++) {
                    messages.push({
                        role: "tool",
                        tool_call_id: ids[idx],
                        content: JSON.stringify(turn.toolCalls[idx].result),
                    });
                }
            }
            else {
                messages.push(msg);
            }
        }
    }
    // Add the last user turn (triggers generation)
    const lastTurn = evalCase.turns[evalCase.turns.length - 1];
    messages.push({ role: "user", content: lastTurn.content });
    const { text, toolCalls } = await runGeneration(config, messages, evalCase.tools);
    const assertionResults = runAssertions(evalCase.assertions ?? [], text, toolCalls);
    const passed = assertionResults.every((a) => a.passed);
    let judgeResult;
    if (judgeRunner) {
        judgeResult = await judgeRunner(lastTurn.content, evalCase.expectedOutput, text, messages);
    }
    return {
        file: fileName,
        case: evalCase.name,
        mode: "injected",
        passed,
        duration: Date.now() - start,
        response: text,
        toolCalls,
        assertionResults,
        judgeResult,
    };
}
200
// ---- Mode 3: Sequential ----
/**
 * Run a multi-turn case: each user turn is sent in order against the live
 * conversation, with optional per-turn assertions and judging, then
 * case-level assertions and judging are applied to the final response.
 */
async function executeSequential(opts, start) {
    const { config, evalCase, fileName, judgeRunner } = opts;
    const messages = [{ role: "system", content: config.systemPrompt }];
    const allToolCalls = [];
    const turnResults = [];
    let lastResponse = "";
    let allPassed = true;
    const userTurns = evalCase.turns.filter((t) => t.role === "user");
    for (const turn of userTurns) {
        messages.push({ role: "user", content: turn.content });
        const { text, toolCalls } = await runGeneration(config, messages, evalCase.tools);
        lastResponse = text;
        allToolCalls.push(...toolCalls);
        // Per-turn assertions (optional)
        let turnAssertionResults;
        if (turn.assertions && turn.assertions.length > 0) {
            turnAssertionResults = runAssertions(turn.assertions, text, toolCalls);
            allPassed = allPassed && turnAssertionResults.every((a) => a.passed);
        }
        // Per-turn judge (opt-in via turn.judge)
        const turnJudgeResult = judgeRunner && turn.judge
            ? await judgeRunner(turn.content, turn.expectedOutput, text, messages)
            : undefined;
        turnResults.push({
            role: "assistant",
            content: text,
            toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
            assertionResults: turnAssertionResults,
            judgeResult: turnJudgeResult,
        });
    }
    // Case-level assertions on the final response
    const caseAssertionResults = runAssertions(evalCase.assertions ?? [], lastResponse, allToolCalls);
    allPassed = allPassed && caseAssertionResults.every((a) => a.passed);
    // Case-level judge over all user input joined together
    let judgeResult;
    if (judgeRunner) {
        const combinedInput = userTurns.map((t) => t.content).join("\n");
        judgeResult = await judgeRunner(combinedInput, evalCase.expectedOutput, lastResponse, messages);
    }
    return {
        file: fileName,
        case: evalCase.name,
        mode: "sequential",
        passed: allPassed,
        duration: Date.now() - start,
        response: lastResponse,
        turnResults,
        toolCalls: allToolCalls,
        assertionResults: caseAssertionResults,
        judgeResult,
    };
}
@@ -0,0 +1,10 @@
1
+ import type { CaseResult, EvalFile } from "./types.js";
2
+ interface LoadedEvalFile {
3
+ fileName: string;
4
+ data: EvalFile;
5
+ }
6
+ /**
7
+ * Format eval results for console output.
8
+ */
9
+ export declare function formatEvalResults(evalFiles: LoadedEvalFile[], results: CaseResult[]): string;
10
+ export {};
@@ -0,0 +1,62 @@
1
/**
 * Format eval results for console output.
 *
 * Produces a per-file listing of case results (with assertion and judge
 * details) followed by an overall pass/fail summary.
 */
export function formatEvalResults(evalFiles, results) {
    const out = [];
    for (const ef of evalFiles) {
        out.push(`\n eval: ${ef.data.name} (${ef.fileName})\n`);
        for (const r of results.filter((res) => res.file === ef.fileName)) {
            out.push(formatCaseLine(r));
            // Show assertion details for failed cases or when there are assertions
            if (!r.passed || r.assertionResults.length > 0) {
                for (const a of r.assertionResults) {
                    const detail = a.message ? ` (${a.message})` : "";
                    const mark = (a.passed ? "\u2713" : "\u2717") + detail;
                    out.push(` - ${a.type} "${a.value}"${alignRight(mark, 30, a.type.length + a.value.length + 5)}`);
                }
            }
            // Failed per-turn assertions (sequential mode only)
            for (const tr of r.turnResults ?? []) {
                for (const a of tr.assertionResults ?? []) {
                    if (a.passed)
                        continue;
                    const detail = a.message ? ` (${a.message})` : "";
                    out.push(` - ${a.type} "${a.value}" \u2717${detail}`);
                }
            }
            // Judge scores, if the case was judged
            if (r.judgeResult) {
                const scores = Object.entries(r.judgeResult.scores)
                    .map(([key, s]) => `${key}=${s.score}`)
                    .join(" ");
                out.push(` - judge: ${scores} avg=${r.judgeResult.overallScore}`);
            }
        }
    }
    // Summary
    const total = results.length;
    const passedCount = results.filter((r) => r.passed).length;
    out.push("");
    out.push(` ${total} cases, ${passedCount} passed, ${total - passedCount} failed`);
    // Average judge score across judged cases only
    const judged = results
        .filter((r) => r.judgeResult)
        .map((r) => r.judgeResult.overallScore);
    if (judged.length > 0) {
        const avg = Math.round((judged.reduce((a, b) => a + b, 0) / judged.length) * 10) / 10;
        out.push(` Average judge score: ${avg} / 10`);
    }
    return out.join("\n");
}
// One line per case: status icon, name, mode, right-aligned duration.
function formatCaseLine(r) {
    const icon = r.passed ? "\u2713" : "\u2717";
    return ` ${icon} ${r.case} (${r.mode})${alignRight(`${r.duration}ms`, 40, r.case.length + r.mode.length + 7)}`;
}
// Pad with spaces (minimum two) so `suffix` lands near the target column.
function alignRight(suffix, targetCol, usedWidth) {
    const gap = Math.max(2, targetCol - usedWidth);
    return " ".repeat(gap) + suffix;
}
@@ -0,0 +1,7 @@
1
+ import type { ChatMessage, ModelConfig } from "../types.js";
2
+ import type { JudgeConfig, JudgeResult } from "./types.js";
3
+ export type JudgeRunnerFn = (userInput: string, expectedOutput: string | undefined, actualResponse: string, conversation: ChatMessage[]) => Promise<JudgeResult>;
4
+ /**
5
+ * Create a judge runner function from a JudgeConfig.
6
+ */
7
+ export declare function createJudgeRunner(judgeConfig: JudgeConfig, fallbackModel: ModelConfig): JudgeRunnerFn;
@@ -0,0 +1,116 @@
1
+ import { Liquid } from "liquidjs";
2
+ import { createClient } from "../llm.js";
3
+ const DEFAULT_PROMPT_TEMPLATE = `请评估以下 AI 助手的回复质量。
4
+
5
+ 用户输入:{{ user_input }}
6
+ {% if expected_output %}期望输出:{{ expected_output }}{% endif %}
7
+ 实际回复:{{ actual_response }}
8
+
9
+ 请对以下每个维度独立评估,并给出简短理由。
10
+ {% for dim in dimensions %}
11
+ {% if dim.type == "binary" %}- {{ dim.label }}({{ dim.key }}):通过 true / 不通过 false
12
+ {% else %}- {{ dim.label }}({{ dim.key }}):{{ dim.min }} - {{ dim.max }} 分
13
+ {% endif %}{% endfor %}
14
+
15
+ 请严格以下面的 JSON 格式回复,不要输出其他内容:
16
+ {
17
+ {% for dim in dimensions %}{% if dim.type == "binary" %} "{{ dim.key }}": { "score": <true 或 false>, "reason": "<理由>" }{% else %} "{{ dim.key }}": { "score": <分数>, "reason": "<理由>" }{% endif %}{% unless forloop.last %},{% endunless %}
18
+ {% endfor %}}`;
19
/**
 * Create a judge runner function from a JudgeConfig.
 *
 * Renders the judge prompt (a Liquid template) with the case context, asks
 * the judge model for per-dimension scores, and parses the reply into a
 * structured JudgeResult.
 */
export function createJudgeRunner(judgeConfig, fallbackModel) {
    const model = judgeConfig.model ?? fallbackModel;
    const client = createClient(model);
    const engine = new Liquid();
    return async (userInput, expectedOutput, actualResponse, conversation) => {
        // Fill in defaults so the template can rely on every dimension field.
        const dimensions = judgeConfig.dimensions.map((d) => ({
            ...d,
            type: d.type ?? "numeric",
            min: d.min ?? 0,
            max: d.max ?? 10,
        }));
        const prompt = await engine.parseAndRender(judgeConfig.promptTemplate ?? DEFAULT_PROMPT_TEMPLATE, {
            user_input: userInput,
            expected_output: expectedOutput ?? "",
            actual_response: actualResponse,
            conversation: conversation.map((m) => `${m.role}: ${m.content}`).join("\n"),
            dimensions,
        });
        const completion = await client.chat.completions.create({
            model: model.model,
            messages: [{ role: "user", content: prompt }],
            max_tokens: model.maxTokens ?? 2048,
            // Low temperature keeps judge scoring close to deterministic.
            temperature: 0.1,
        });
        return parseJudgeResponse(completion.choices[0]?.message?.content ?? "", judgeConfig);
    };
}
50
/**
 * Parse the judge LLM response into structured scores.
 *
 * Extracts the first JSON object from the response text, reads one entry per
 * configured dimension, and computes a weighted overall score normalized to
 * a 0-10 scale. Unparseable responses yield zero scores rather than throwing.
 * Boolean and numeric scores sent as strings ("true", "false", "8") are
 * coerced, since judge models frequently quote their values.
 */
function parseJudgeResponse(content, config) {
    const scores = {};
    try {
        // Extract JSON from the response (may be wrapped in markdown code blocks)
        const jsonMatch = content.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
            throw new Error("No JSON found in judge response");
        }
        const parsed = JSON.parse(jsonMatch[0]);
        for (const dim of config.dimensions) {
            const entry = parsed[dim.key];
            if (!entry) {
                scores[dim.key] = { score: 0, reason: "Failed to parse score" };
                continue;
            }
            if ((dim.type ?? "numeric") === "binary") {
                // Binary: convert boolean-ish value to 0/1
                scores[dim.key] = {
                    score: coerceBinary(entry.score) ? 1 : 0,
                    reason: entry.reason ?? "",
                };
            }
            else {
                // Numeric
                const num = coerceNumber(entry.score);
                scores[dim.key] = num === undefined
                    ? { score: 0, reason: "Failed to parse score" }
                    : { score: num, reason: entry.reason ?? "" };
            }
        }
    }
    catch {
        for (const dim of config.dimensions) {
            scores[dim.key] = { score: 0, reason: "Failed to parse judge response" };
        }
    }
    // Calculate weighted overall score
    let totalWeight = 0;
    let weightedSum = 0;
    for (const dim of config.dimensions) {
        const s = scores[dim.key];
        if (s) {
            if ((dim.type ?? "numeric") === "binary") {
                // Binary: 0 or 1 → normalize to 0-10
                weightedSum += s.score * 10 * dim.weight;
            }
            else {
                const max = dim.max ?? 10;
                const min = dim.min ?? 0;
                const normalized = max !== min ? ((s.score - min) / (max - min)) * 10 : 0;
                weightedSum += normalized * dim.weight;
            }
            totalWeight += dim.weight;
        }
    }
    const overallScore = totalWeight > 0 ? Math.round((weightedSum / totalWeight) * 10) / 10 : 0;
    return { scores, overallScore };
}
// Accept booleans plus the string forms "true"/"false"; any other value
// falls back to JS truthiness (previous behavior).
function coerceBinary(value) {
    if (typeof value === "boolean")
        return value;
    if (value === "true")
        return true;
    if (value === "false")
        return false;
    return !!value;
}
// Accept numbers plus finite numeric strings; undefined means unparseable.
function coerceNumber(value) {
    if (typeof value === "number")
        return value;
    if (typeof value === "string" && value.trim() !== "" && Number.isFinite(Number(value))) {
        return Number(value);
    }
    return undefined;
}
@@ -0,0 +1,9 @@
1
+ import type { EvalSummary, EvalDetails, EvalOptions } from "./types.js";
2
+ /**
3
+ * Run all eval files for an agent directory.
4
+ */
5
+ export declare function runEvals(agentDir: string, options?: EvalOptions): Promise<{
6
+ summary: EvalSummary;
7
+ details: EvalDetails;
8
+ formatted: string;
9
+ }>;
@@ -0,0 +1,156 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { loadAgentConfig } from "../config.js";
4
+ import { executeCase } from "./execute.js";
5
+ import { createJudgeRunner } from "./judge.js";
6
+ import { formatEvalResults } from "./formatter.js";
7
/**
 * Run all eval files for an agent directory.
 *
 * Loads the agent config and all .eval.json files under eval-cases/,
 * optionally filters them by file name or tag, wires up LLM judges from
 * eval-judges/, executes every case sequentially, and returns the summary,
 * per-case details, and a console-formatted report.
 */
export async function runEvals(agentDir, options = {}) {
    const absDir = path.resolve(agentDir);
    const casesDir = path.join(absDir, "eval-cases");
    const judgesDir = path.join(absDir, "eval-judges");
    // Load agent config
    const config = await loadAgentConfig(agentDir);
    // Load eval files from eval-cases/
    let evalFiles = await loadEvalFiles(casesDir);
    // Optional filter: run only one eval file.
    if (options.file) {
        evalFiles = evalFiles.filter((f) => f.fileName === options.file);
        if (evalFiles.length === 0) {
            throw new Error(`Eval file not found: ${options.file}`);
        }
    }
    // Optional filter: keep only cases carrying the requested tag, dropping
    // files that end up with no cases.
    if (options.tag) {
        for (const ef of evalFiles) {
            ef.data.cases = ef.data.cases.filter((c) => c.tags && c.tags.includes(options.tag));
        }
        evalFiles = evalFiles.filter((ef) => ef.data.cases.length > 0);
    }
    if (evalFiles.length === 0) {
        throw new Error("No eval cases found");
    }
    // Judges are optional and can be disabled entirely via options.noJudge.
    let judgeRunners;
    if (!options.noJudge) {
        const judgeConfigs = await loadJudgeConfigs(judgesDir);
        if (judgeConfigs.size > 0) {
            judgeRunners = new Map([...judgeConfigs].map(([name, jc]) => [name, createJudgeRunner(jc, config.model)]));
        }
    }
    // Execute every case sequentially so conversations don't interleave.
    const startTime = Date.now();
    const results = [];
    for (const ef of evalFiles) {
        for (const evalCase of ef.data.cases) {
            // Cases may name a specific judge; otherwise use "default".
            const judgeRunner = judgeRunners?.get(evalCase.judge ?? "default");
            results.push(await executeCase({
                config,
                evalCase,
                fileName: ef.fileName,
                judgeRunner,
            }));
        }
    }
    const totalDuration = Date.now() - startTime;
    // Build summary
    const summary = buildSummary(results, absDir, config, totalDuration, evalFiles);
    const details = { results };
    // Save results if requested
    if (options.save) {
        await saveResults(absDir, summary, details);
    }
    return { summary, details, formatted: formatEvalResults(evalFiles, results) };
}
77
/**
 * Load and parse every *.eval.json file in the eval-cases directory.
 *
 * @throws if the directory is missing, contains no eval files, or a file
 *         holds invalid JSON (the error names the offending file).
 */
async function loadEvalFiles(casesDir) {
    let files;
    try {
        files = await fs.readdir(casesDir);
    }
    catch {
        throw new Error(`No eval-cases/ directory found`);
    }
    const evalFiles = files.filter((f) => f.endsWith(".eval.json"));
    if (evalFiles.length === 0) {
        throw new Error("No .eval.json files found in eval-cases/");
    }
    const loaded = [];
    for (const file of evalFiles) {
        const content = await fs.readFile(path.join(casesDir, file), "utf-8");
        let parsed;
        try {
            parsed = JSON.parse(content);
        }
        catch (err) {
            // A bare SyntaxError gives no context — name the offending file.
            throw new Error(`Invalid JSON in eval file ${file}: ${err instanceof Error ? err.message : String(err)}`);
        }
        loaded.push({ fileName: file, data: parsed });
    }
    return loaded;
}
97
/**
 * Load judge configs from the eval-judges directory, keyed by file name
 * (without the .json extension). Returns an empty map when the directory
 * does not exist — judges are optional.
 *
 * @throws if a judge file holds invalid JSON (the error names the file).
 */
async function loadJudgeConfigs(judgesDir) {
    const configs = new Map();
    let files;
    try {
        files = await fs.readdir(judgesDir);
    }
    catch {
        return configs; // eval-judges/ is optional
    }
    const jsonFiles = files.filter((f) => f.endsWith(".json"));
    for (const file of jsonFiles) {
        const name = file.replace(/\.json$/, "");
        const content = await fs.readFile(path.join(judgesDir, file), "utf-8");
        let parsed;
        try {
            parsed = JSON.parse(content);
        }
        catch (err) {
            // A bare SyntaxError gives no context — name the offending file.
            throw new Error(`Invalid JSON in judge file ${file}: ${err instanceof Error ? err.message : String(err)}`);
        }
        configs.set(name, parsed);
    }
    return configs;
}
115
/**
 * Aggregate per-case results into the run-level summary: overall pass/fail
 * counts, average judge score (null when nothing was judged), and per-file
 * statistics.
 */
function buildSummary(results, absDir, config, totalDuration, evalFiles) {
    const passed = results.filter((r) => r.passed).length;
    // Average judge score across judged cases only, rounded to one decimal.
    const judgeScores = results
        .filter((r) => r.judgeResult)
        .map((r) => r.judgeResult.overallScore);
    const averageScore = judgeScores.length > 0
        ? Math.round((judgeScores.reduce((a, b) => a + b, 0) / judgeScores.length) * 10) / 10
        : null;
    // Per-file pass/fail breakdown.
    const files = evalFiles.map((ef) => {
        const caseResults = results.filter((r) => r.file === ef.fileName);
        const casesPassed = caseResults.filter((r) => r.passed).length;
        return {
            file: ef.fileName,
            name: ef.data.name,
            cases: caseResults.length,
            passed: casesPassed,
            failed: caseResults.length - casesPassed,
        };
    });
    return {
        timestamp: new Date().toISOString(),
        agent: path.basename(absDir),
        model: `${config.model.provider}/${config.model.model}`,
        totalCases: results.length,
        passed,
        failed: results.length - passed,
        averageScore,
        duration: totalDuration,
        files,
    };
}
149
/**
 * Persist summary.json and details.json under a timestamped directory in
 * eval-results/ and return the directory path.
 */
async function saveResults(absDir, summary, details) {
    // e.g. "2024-01-02T03-04-05" — colons/dots are not filesystem-safe.
    const stamp = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19);
    const resultsDir = path.join(absDir, "eval-results", stamp);
    await fs.mkdir(resultsDir, { recursive: true });
    const write = (name, data) => fs.writeFile(path.join(resultsDir, name), JSON.stringify(data, null, 2));
    await write("summary.json", summary);
    await write("details.json", details);
    return resultsDir;
}
@@ -0,0 +1,67 @@
1
+ export type { AssertionType, Assertion, EvalTurnToolCall, EvalTurn, EvalMode, EvalCase, EvalFile, JudgeDimension, JudgeConfig, } from "../schemas.js";
2
+ import type { AssertionType, EvalMode } from "../schemas.js";
3
+ export interface AssertionResult {
4
+ type: AssertionType;
5
+ value: string;
6
+ passed: boolean;
7
+ message?: string;
8
+ }
9
+ export interface JudgeScore {
10
+ score: number;
11
+ reason: string;
12
+ }
13
+ export interface JudgeResult {
14
+ scores: Record<string, JudgeScore>;
15
+ overallScore: number;
16
+ }
17
+ export interface ToolCallRecord {
18
+ name: string;
19
+ args: Record<string, unknown>;
20
+ result: unknown;
21
+ }
22
+ export interface TurnResult {
23
+ role: "user" | "assistant";
24
+ content: string;
25
+ toolCalls?: ToolCallRecord[];
26
+ assertionResults?: AssertionResult[];
27
+ judgeResult?: JudgeResult;
28
+ }
29
+ export interface CaseResult {
30
+ file: string;
31
+ case: string;
32
+ mode: EvalMode;
33
+ passed: boolean;
34
+ duration: number;
35
+ response: string;
36
+ turnResults?: TurnResult[];
37
+ toolCalls: ToolCallRecord[];
38
+ assertionResults: AssertionResult[];
39
+ judgeResult?: JudgeResult;
40
+ }
41
+ export interface FileResult {
42
+ file: string;
43
+ name: string;
44
+ cases: number;
45
+ passed: number;
46
+ failed: number;
47
+ }
48
+ export interface EvalSummary {
49
+ timestamp: string;
50
+ agent: string;
51
+ model: string;
52
+ totalCases: number;
53
+ passed: number;
54
+ failed: number;
55
+ averageScore: number | null;
56
+ duration: number;
57
+ files: FileResult[];
58
+ }
59
+ export interface EvalDetails {
60
+ results: CaseResult[];
61
+ }
62
+ export interface EvalOptions {
63
+ file?: string;
64
+ tag?: string;
65
+ save?: boolean;
66
+ noJudge?: boolean;
67
+ }