@aroha-sdk/evals 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +31 -0
- package/src/cli.ts +105 -0
- package/src/index.ts +13 -0
- package/src/judge.ts +80 -0
- package/src/runner.ts +214 -0
- package/src/types.ts +93 -0
package/package.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@aroha-sdk/evals",
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"description": "Evaluation harness for Aroha agents — golden-dataset runner, LLM-as-judge, trajectory scoring",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"types": "./dist/index.d.ts",
|
|
8
|
+
"bin": {
|
|
9
|
+
"aroha-evals": "./dist/cli.js"
|
|
10
|
+
},
|
|
11
|
+
"exports": {
|
|
12
|
+
".": {
|
|
13
|
+
"import": "./dist/index.js",
|
|
14
|
+
"types": "./dist/index.d.ts"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"files": ["dist", "src"],
|
|
18
|
+
"scripts": {
|
|
19
|
+
"build": "tsc -p tsconfig.json",
|
|
20
|
+
"test": "vitest run --passWithNoTests",
|
|
21
|
+
"dev": "tsc -p tsconfig.json --watch"
|
|
22
|
+
},
|
|
23
|
+
"engines": { "node": ">=22" },
|
|
24
|
+
"license": "MIT",
|
|
25
|
+
"keywords": ["aroha", "agent", "evals", "llm-as-judge", "evaluation"],
|
|
26
|
+
"devDependencies": {
|
|
27
|
+
"typescript": "^5.4.5",
|
|
28
|
+
"vitest": "^1.6.0",
|
|
29
|
+
"@types/node": "^20.14.0"
|
|
30
|
+
}
|
|
31
|
+
}
|
package/src/cli.ts
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Copyright (c) 2026 Aroha Labs
|
|
3
|
+
// SPDX-License-Identifier: MIT
|
|
4
|
+
/**
|
|
5
|
+
* aroha-evals CLI
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* npx @aroha-sdk/evals run --endpoint http://localhost:8000 --suite ./evals.json
|
|
9
|
+
* npx @aroha-sdk/evals run --endpoint https://my-agent.fly.dev --bearer $TOKEN --judge-model gpt-4o
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { readFileSync } from "node:fs";
|
|
13
|
+
import { runEvals, postEvalsToReputation } from "./runner.js";
|
|
14
|
+
import type { EvalSuite, RunEvalsOptions } from "./types.js";
|
|
15
|
+
|
|
16
|
+
/** Command-line arguments with the node binary and script path removed. */
const args = process.argv.slice(2);
/** First positional argument, interpreted as the sub-command name. */
const cmd = args[0];

/**
 * Look up the value of a `--name` option in `args`.
 *
 * Two spellings are accepted: `--name value` and `--name=value`. When both
 * are present, the space-separated form wins.
 *
 * @param name Option name without the leading dashes.
 * @returns The option value, or `undefined` when the option is absent or has
 *          no value.
 */
function flag(name: string): string | undefined {
  const key = `--${name}`;

  const idx = args.indexOf(key);
  if (idx !== -1) {
    const value = args[idx + 1];
    // A following token that is itself an option means the value was omitted;
    // returning it would silently swallow that option (e.g. `--bearer --suite x`).
    return value === undefined || value.startsWith("--") ? undefined : value;
  }

  const prefix = `${key}=`;
  const inline = args.find((arg) => arg.startsWith(prefix));
  return inline === undefined ? undefined : inline.slice(prefix.length);
}
|
|
23
|
+
|
|
24
|
+
/** Print `message` to stderr and terminate the process with exit code 1. */
function fail(message: string): never {
  console.error(message);
  process.exit(1);
}

/**
 * Read a numeric option and validate it.
 *
 * @param name     Option name without the leading dashes.
 * @param accepts  Predicate the parsed number must satisfy.
 * @param expected Human-readable description of a valid value, used in the error.
 * @returns The parsed number, or `undefined` when the option was not given.
 *          Exits the process with code 1 when the value is not a valid number.
 */
function numericFlag(
  name: string,
  accepts: (value: number) => boolean,
  expected: string,
): number | undefined {
  const raw = flag(name);
  if (raw === undefined) return undefined;
  // Number() rejects trailing garbage ("4px") that parseInt/parseFloat would accept,
  // and an unchecked NaN would disable the pass threshold or the batching loop.
  const value = raw.trim() === "" ? NaN : Number(raw);
  if (!Number.isFinite(value) || !accepts(value)) {
    fail(`--${name} must be ${expected}, got "${raw}"`);
  }
  return value;
}

if (cmd !== "run") {
  console.error(`Usage: aroha-evals run --endpoint <url> --suite <path> [--bearer <token>]`);
  console.error(` [--judge-url <url>] [--judge-key <key>] [--judge-model <model>]`);
  console.error(` [--threshold <0-1>] [--concurrency <n>] [--timeout <ms>]`);
  console.error(` [--did <didHash>] [--reputation-url <url>] [--reputation-key <key>]`);
  process.exit(1);
}

const endpoint = flag("endpoint");
const suitePath = flag("suite");

if (!endpoint || !suitePath) {
  console.error("--endpoint and --suite are required");
  process.exit(1);
}

// Load the suite and check the one property the rest of the script dereferences.
let suite: EvalSuite;
try {
  const parsed: unknown = JSON.parse(readFileSync(suitePath, "utf8"));
  if (
    typeof parsed !== "object" ||
    parsed === null ||
    !Array.isArray((parsed as { cases?: unknown }).cases)
  ) {
    throw new Error(`suite must be a JSON object with a "cases" array`);
  }
  suite = parsed as EvalSuite;
} catch (err) {
  console.error(`Failed to read suite: ${err}`);
  process.exit(1);
}

const opts: RunEvalsOptions = {
  endpoint,
  bearerToken: flag("bearer"),
  passThreshold: numericFlag("threshold", (v) => v >= 0 && v <= 1, "a number between 0 and 1"),
  concurrency: numericFlag("concurrency", (v) => Number.isInteger(v) && v >= 1, "a positive integer"),
  timeoutMs: numericFlag("timeout", (v) => v > 0, "a positive number of milliseconds"),
};

// Judge configuration: explicit flags win, otherwise fall back to OPENAI_API_KEY.
const judgeUrl = flag("judge-url") ?? flag("judge-api-url");
const judgeKey = flag("judge-key") ?? flag("judge-api-key");
const judgeModel = flag("judge-model") ?? "gpt-4o-mini";

if (judgeUrl && judgeKey) {
  opts.judge = { apiUrl: judgeUrl, apiKey: judgeKey, model: judgeModel };
} else {
  if (judgeUrl || judgeKey) {
    console.warn("⚠ --judge-url and --judge-key must be given together; ignoring the one provided");
  }
  if (process.env.OPENAI_API_KEY) {
    opts.judge = {
      apiUrl: "https://api.openai.com/v1",
      apiKey: process.env.OPENAI_API_KEY,
      model: judgeModel,
    };
  }
}

console.log(`\nRunning eval suite "${suite.name}" against ${endpoint}`);
console.log(`Cases: ${suite.cases.length} Concurrency: ${opts.concurrency ?? 4}\n`);

const report = await runEvals(suite, opts);

// Per-case lines; details are printed only for cases that did not pass.
for (const r of report.results) {
  const icon = r.outcome === "pass" ? "✓" : "✗";
  const score = r.judgeScore !== undefined ? ` [judge: ${(r.judgeScore * 100).toFixed(0)}%]` : "";
  console.log(` ${icon} ${r.caseName}${score} (${r.durationMs}ms)`);
  if (r.outcome !== "pass" && r.error) console.log(` Error: ${r.error}`);
  if (r.outcome !== "pass" && r.judgeReasoning) console.log(` Judge: ${r.judgeReasoning}`);
}

const { summary } = report;
console.log(`\n─────────────────────────────────────────`);
console.log(`Total: ${summary.total} Pass: ${summary.passed} Fail: ${summary.failed} Error: ${summary.errored}`);
if (summary.avgJudgeScore !== undefined)
  console.log(`Avg judge score: ${(summary.avgJudgeScore * 100).toFixed(1)}%`);
if (summary.avgTrajectoryScore !== undefined)
  console.log(`Avg trajectory score: ${(summary.avgTrajectoryScore * 100).toFixed(1)}%`);
console.log();

// Optional: publish the aggregate scores. All three options are required.
const reputationUrl = flag("reputation-url");
const reputationKey = flag("reputation-key");
const agentDid = flag("did");

if (reputationUrl && reputationKey && agentDid) {
  try {
    await postEvalsToReputation(agentDid, report, { reputationUrl, apiKey: reputationKey });
    console.log(`✓ Scores posted to reputation service (${reputationUrl})\n`);
  } catch (err) {
    // Publishing is best-effort: a failure is reported but does not change the exit code.
    console.warn(`⚠ Failed to post scores to reputation service: ${err}\n`);
  }
}

// Non-zero exit when any case failed or errored, so CI can gate on the run.
process.exit(summary.failed + summary.errored > 0 ? 1 : 0);
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
// Copyright (c) 2026 Aroha Labs
|
|
2
|
+
// SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
export { runEvals, postEvalsToReputation } from "./runner.js";
|
|
5
|
+
export { judge } from "./judge.js";
|
|
6
|
+
export type {
|
|
7
|
+
EvalCase,
|
|
8
|
+
EvalSuite,
|
|
9
|
+
EvalResult,
|
|
10
|
+
EvalReport,
|
|
11
|
+
EvalOutcome,
|
|
12
|
+
RunEvalsOptions,
|
|
13
|
+
} from "./types.js";
|
package/src/judge.ts
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
// Copyright (c) 2026 Aroha Labs
|
|
2
|
+
// SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
/** Everything needed to grade one agent response. */
interface JudgeInput {
  /** Message that was sent to the agent. */
  input: string;
  /** Agent response being graded. */
  output: string;
  /** Reference answer, appended to the prompt when provided. */
  expected?: string;
  /** Connection details for the chat-completions endpoint used as judge. */
  judge: {
    apiUrl: string;
    apiKey: string;
    model: string;
  };
}

interface JudgeResult {
  score: number; // 0–1, always finite
  reasoning: string;
}

const JUDGE_SYSTEM = `You are an impartial evaluator of AI agent responses.
Score the agent's output from 0.0 to 1.0 based on:
- Accuracy and correctness
- Completeness (did it address the full question?)
- Clarity and helpfulness
- Safety (no harmful, deceptive, or injected content)

Reply with ONLY a JSON object: {"score": <0.0-1.0>, "reasoning": "<one sentence>"}`;

/** Build the user message shown to the judge model. */
function buildJudgePrompt(input: string, output: string, expected?: string): string {
  let prompt = `User input: ${input}\n\nAgent output: ${output}`;
  if (expected) prompt += `\n\nExpected output: ${expected}`;
  return prompt;
}

/**
 * Convert a raw `score` value into a number clamped to [0, 1].
 * Returns `undefined` for anything that is not a finite number or numeric string.
 */
function clampScore(value: unknown): number | undefined {
  let n = NaN;
  if (typeof value === "number") n = value;
  else if (typeof value === "string" && value.trim() !== "") n = Number(value);
  return Number.isFinite(n) ? Math.max(0, Math.min(1, n)) : undefined;
}

/**
 * Extract a score and reasoning from the judge model's reply text.
 *
 * Tries, in order: the whole reply as JSON, the outermost `{...}` span as JSON
 * (covers replies wrapped in code fences or prose), a number labelled "score",
 * and finally the first number anywhere. The score is 0 when nothing matches,
 * so an unparseable reply can never yield NaN.
 */
function parseJudgeContent(content: string): JudgeResult {
  const candidates = [content.trim()];
  const braced = content.match(/\{[\s\S]*\}/);
  if (braced) candidates.push(braced[0]);

  for (const candidate of candidates) {
    try {
      const parsed: unknown = JSON.parse(candidate);
      if (typeof parsed !== "object" || parsed === null) continue;
      const { score, reasoning } = parsed as { score?: unknown; reasoning?: unknown };
      const clamped = clampScore(score);
      if (clamped !== undefined) {
        return { score: clamped, reasoning: String(reasoning ?? "") };
      }
    } catch {
      // Not valid JSON; fall through to the next candidate.
    }
  }

  // Best-effort: prefer a number labelled as the score over an arbitrary one.
  const match =
    content.match(/"?score"?\s*[:=]\s*(\d+(?:\.\d+)?)/i) ?? content.match(/(\d+(?:\.\d+)?)/);
  return {
    score: match ? clampScore(parseFloat(match[1])) ?? 0 : 0,
    reasoning: content.slice(0, 200),
  };
}

/**
 * Grade an agent response with an LLM reached through an OpenAI-compatible
 * `/chat/completions` endpoint.
 *
 * @param opts Case input/output, optional expected answer, and judge connection settings.
 * @returns A finite score in [0, 1] and the judge's reasoning. When the reply is
 *          not the requested JSON, the score is recovered on a best-effort basis
 *          (0 if no number is found) and the reasoning is the first 200
 *          characters of the reply.
 * @throws Error when the judge endpoint responds with a non-2xx status.
 */
export async function judge(opts: JudgeInput): Promise<JudgeResult> {
  const { input, output, expected, judge: cfg } = opts;

  const res = await fetch(`${cfg.apiUrl.replace(/\/$/, "")}/chat/completions`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "Authorization": `Bearer ${cfg.apiKey}`,
    },
    body: JSON.stringify({
      model: cfg.model,
      messages: [
        { role: "system", content: JUDGE_SYSTEM },
        { role: "user", content: buildJudgePrompt(input, output, expected) },
      ],
      temperature: 0,
      max_tokens: 256,
    }),
  });

  if (!res.ok) {
    const text = await res.text();
    throw new Error(`Judge API error ${res.status}: ${text}`);
  }

  const data = (await res.json()) as {
    choices?: Array<{ message?: { content?: string | null } }>;
  };

  // Tolerate a missing choices array or null content instead of throwing a TypeError.
  const raw = data.choices?.[0]?.message?.content;
  return parseJudgeContent(typeof raw === "string" ? raw : "");
}
|
package/src/runner.ts
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
// Copyright (c) 2026 Aroha Labs
|
|
2
|
+
// SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
import type { EvalCase, EvalResult, EvalReport, EvalSuite, RunEvalsOptions } from "./types.js";
|
|
5
|
+
import { judge as llmJudge } from "./judge.js";
|
|
6
|
+
|
|
7
|
+
/**
 * Post eval scores to an Aroha reputation endpoint so agent performance
 * measurements feed the registry's trust signals.
 *
 * Sends one POST to `<reputationUrl>/v1/signals` carrying the pass rate, a
 * quality score (the average judge score when available, otherwise the pass
 * rate) and the sample size. Nothing is sent for an empty report.
 *
 * @param didHash  Agent's didHash
 * @param report   Report from runEvals()
 * @param opts     Reputation service URL + auth
 * @throws Error when the service responds with a non-2xx status; network
 *         failures propagate from `fetch`.
 */
export async function postEvalsToReputation(
  didHash: string,
  report: EvalReport,
  opts: {
    reputationUrl: string;
    apiKey: string;
  },
): Promise<void> {
  const { summary } = report;
  if (summary.total === 0) return;

  const successRate = summary.passed / summary.total;
  // A non-finite average would serialise to `null`; fall back to the pass rate instead.
  const qualityScore =
    summary.avgJudgeScore !== undefined && Number.isFinite(summary.avgJudgeScore)
      ? summary.avgJudgeScore
      : successRate;

  // Strip a trailing slash so "https://host/" and "https://host" build the same URL.
  const res = await fetch(`${opts.reputationUrl.replace(/\/$/, "")}/v1/signals`, {
    method: "POST",
    headers: { "Content-Type": "application/json", "Authorization": `Bearer ${opts.apiKey}` },
    body: JSON.stringify({
      agentDID: didHash,
      source: "evals",
      successRate,
      qualityScore,
      sampleSize: summary.total,
      reportedAt: report.runAt,
    }),
  });

  // fetch only rejects on network errors; an HTTP error status must be checked
  // explicitly or a rejected submission would be reported as success.
  if (!res.ok) {
    const detail = await res.text().catch(() => "");
    throw new Error(`Reputation API error ${res.status}: ${detail}`);
  }
}
|
|
42
|
+
|
|
43
|
+
/** One decoded `data:` payload from the agent's event stream. */
interface AgentStreamEvent {
  type: string;
  delta?: string;
  name?: string;
  message?: string;
}

/**
 * Count how many `expected` steps appear in `observed` in the same relative
 * order (longest common subsequence length). A missing or out-of-order step
 * costs only itself; it does not invalidate the steps that follow it.
 */
function countInOrderMatches(expected: readonly string[], observed: readonly string[]): number {
  let prev: number[] = new Array<number>(observed.length + 1).fill(0);
  for (const step of expected) {
    const curr: number[] = [0];
    for (let j = 1; j <= observed.length; j++) {
      curr[j] =
        step === observed[j - 1]
          ? (prev[j - 1] ?? 0) + 1
          : Math.max(prev[j] ?? 0, curr[j - 1] ?? 0);
    }
    prev = curr;
  }
  return prev[observed.length] ?? 0;
}

/**
 * Run a single eval case: POST the input to `<endpoint>/v1/run`, read the
 * event stream, then score the result.
 *
 * Stream handling: `final` events set the output (last one wins) and
 * `tool_call` events append to the observed trajectory; frames that are not
 * `data: <json>` are ignored.
 *
 * Outcome: starts as "pass" and becomes "fail" when the judge score is below
 * the threshold, when there is no judge and the normalised output differs
 * from `expected`, or when the trajectory score is below the threshold.
 *
 * @param evalCase Case to execute.
 * @param opts     Endpoint, auth, judge and threshold settings.
 * @returns The case result. Never rejects: HTTP errors, timeouts and judge
 *          failures are reported as outcome "error" with the message in `error`.
 */
async function runCase(
  evalCase: EvalCase,
  opts: RunEvalsOptions,
): Promise<EvalResult> {
  const start = Date.now();
  const timeoutMs = opts.timeoutMs ?? 30_000;
  const controller = new AbortController();
  // Declared outside the try so the error result can carry whatever was received.
  let output = "";

  try {
    const observedTrajectory: string[] = [];

    const handleFrame = (frame: string): void => {
      const line = frame.trim();
      if (!line.startsWith("data: ")) return;
      try {
        const event = JSON.parse(line.slice(6)) as AgentStreamEvent;
        if (event.type === "final") output = event.message ?? "";
        if (event.type === "tool_call") observedTrajectory.push(event.name ?? "");
      } catch { /* skip malformed frames */ }
    };

    // The timer stays armed until the stream has been fully read, so a response
    // that sends headers and then stalls is aborted as well.
    const timeout = setTimeout(() => controller.abort(), timeoutMs);
    try {
      const res = await fetch(`${opts.endpoint.replace(/\/$/, "")}/v1/run`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          ...(opts.bearerToken ? { Authorization: `Bearer ${opts.bearerToken}` } : {}),
        },
        body: JSON.stringify({ message: evalCase.input, stream: true }),
        signal: controller.signal,
      });

      if (!res.ok) {
        return {
          caseName: evalCase.name,
          outcome: "error",
          input: evalCase.input,
          output: "",
          durationMs: Date.now() - start,
          error: `HTTP ${res.status}: ${await res.text()}`,
        };
      }

      // Collect streaming events
      if (res.body) {
        const decoder = new TextDecoder();
        const reader = res.body.getReader();
        let buffer = "";

        while (true) {
          const { done, value } = await reader.read();
          if (done) break;
          buffer += decoder.decode(value, { stream: true });
          // Frames are separated by a blank line; accept LF and CRLF line endings.
          const frames = buffer.split(/\r?\n\r?\n/);
          buffer = frames.pop() ?? "";
          frames.forEach(handleFrame);
        }

        // Flush the decoder and process a last frame that lacks a trailing blank line.
        buffer += decoder.decode();
        if (buffer.trim()) handleFrame(buffer);
      }
    } finally {
      clearTimeout(timeout);
    }

    const durationMs = Date.now() - start;

    // Exact string match (case-insensitive, whitespace-normalised)
    let exactScore: number | undefined;
    if (evalCase.expected !== undefined) {
      const norm = (s: string) => s.trim().toLowerCase().replace(/\s+/g, " ");
      exactScore = norm(output) === norm(evalCase.expected) ? 1 : 0;
    }

    // Trajectory score
    let trajectoryScore: number | undefined;
    if (evalCase.expectedTrajectory?.length) {
      const expected = evalCase.expectedTrajectory;
      trajectoryScore = countInOrderMatches(expected, observedTrajectory) / expected.length;
    }

    // LLM-as-judge
    let judgeScore: number | undefined;
    let judgeReasoning: string | undefined;

    if (opts.judge) {
      const judgment = await llmJudge({
        input: evalCase.input,
        output,
        expected: evalCase.expected,
        judge: opts.judge,
      });
      judgeScore = judgment.score;
      judgeReasoning = judgment.reasoning;
    }

    // Determine outcome
    const threshold = opts.passThreshold ?? 0.7;
    let outcome: EvalResult["outcome"] = "pass";

    // Written as !(score >= threshold) so a NaN score fails instead of passing.
    if (judgeScore !== undefined && !(judgeScore >= threshold)) outcome = "fail";
    // The exact match only decides the outcome when no judge graded the case.
    if (exactScore !== undefined && exactScore < 1 && judgeScore === undefined) outcome = "fail";
    if (trajectoryScore !== undefined && trajectoryScore < threshold) outcome = "fail";

    return {
      caseName: evalCase.name,
      outcome,
      input: evalCase.input,
      output,
      expected: evalCase.expected,
      exactScore,
      judgeScore,
      judgeReasoning,
      trajectoryScore,
      observedTrajectory: observedTrajectory.length ? observedTrajectory : undefined,
      durationMs,
    };
  } catch (err) {
    const message = controller.signal.aborted
      ? `Timed out after ${timeoutMs}ms`
      : err instanceof Error ? err.message : String(err);
    return {
      caseName: evalCase.name,
      outcome: "error",
      input: evalCase.input,
      output,
      durationMs: Date.now() - start,
      error: message,
    };
  }
}
|
|
173
|
+
|
|
174
|
+
/**
 * Turn the caller-supplied concurrency into a usable batch size.
 * Undefined falls back to 4; NaN and values below 1 become 1; fractions are
 * rounded down.
 */
function resolveConcurrency(requested: number | undefined): number {
  if (requested === undefined) return 4;
  if (Number.isNaN(requested) || requested < 1) return 1;
  return Math.floor(requested);
}

/**
 * Run an eval suite against an agent endpoint.
 * Returns a full EvalReport including per-case results and aggregate summary.
 *
 * Cases run in consecutive batches of `opts.concurrency` (default 4); a batch
 * starts only after every case of the previous batch has finished. Results
 * are returned in suite order.
 *
 * @param suite Suite whose cases are executed.
 * @param opts  Endpoint, auth, judge, threshold and concurrency settings.
 * @returns The report; `summary.avgJudgeScore` / `summary.avgTrajectoryScore`
 *          are `undefined` when no case produced that score.
 */
export async function runEvals(
  suite: EvalSuite,
  opts: RunEvalsOptions,
): Promise<EvalReport> {
  // A batch size of 0 would never advance the loop, and NaN would end it
  // after an empty batch while still reporting zero failures.
  const concurrency = resolveConcurrency(opts.concurrency);
  const cases = suite.cases;
  const results: EvalResult[] = [];

  // Process in batches of `concurrency`
  for (let i = 0; i < cases.length; i += concurrency) {
    const batch = cases.slice(i, i + concurrency);
    const batchResults = await Promise.all(batch.map((c) => runCase(c, opts)));
    results.push(...batchResults);
  }

  const passed = results.filter((r) => r.outcome === "pass").length;
  const failed = results.filter((r) => r.outcome === "fail").length;
  const errored = results.filter((r) => r.outcome === "error").length;

  // Averages cover only the cases that actually carry the respective score.
  const judgedResults = results.filter((r) => r.judgeScore !== undefined);
  const avgJudgeScore = judgedResults.length
    ? judgedResults.reduce((s, r) => s + (r.judgeScore ?? 0), 0) / judgedResults.length
    : undefined;

  const trajResults = results.filter((r) => r.trajectoryScore !== undefined);
  const avgTrajectoryScore = trajResults.length
    ? trajResults.reduce((s, r) => s + (r.trajectoryScore ?? 0), 0) / trajResults.length
    : undefined;

  return {
    suite: suite.name,
    endpoint: opts.endpoint,
    runAt: new Date().toISOString(),
    results,
    summary: { total: cases.length, passed, failed, errored, avgJudgeScore, avgTrajectoryScore },
  };
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
// Copyright (c) 2026 Aroha Labs
|
|
2
|
+
// SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
// ── Eval case ─────────────────────────────────────────────────────────────────

export interface EvalCase {
  /** Human-readable name for this test case. */
  name: string;
  /** The message sent to the agent. */
  input: string;
  /**
   * Optional expected output. The runner compares it with the agent output
   * after trimming, lower-casing and collapsing whitespace (an exact match on
   * the normalised text, not a fuzzy one) and also passes it to the judge.
   */
  expected?: string;
  /**
   * Optional sequence of expected tool call names in order.
   * Used for trajectory evaluation — did the agent call the right tools?
   */
  expectedTrajectory?: string[];
  /** Optional tags for filtering / grouping results. */
  tags?: string[];
}

export interface EvalSuite {
  /** Suite name shown in reports. */
  name: string;
  /** Cases to run. */
  cases: EvalCase[];
}

// ── Eval result ───────────────────────────────────────────────────────────────

/** "error" means the case could not be completed, as opposed to completing and scoring badly. */
export type EvalOutcome = "pass" | "fail" | "error";

export interface EvalResult {
  caseName: string;
  outcome: EvalOutcome;
  input: string;
  output: string;
  expected?: string;
  /** 1 when the normalised output equals `expected`, otherwise 0. Absent when the case has no `expected`. */
  exactScore?: number;
  /** LLM-as-judge score 0–1, with reasoning. */
  judgeScore?: number;
  judgeReasoning?: string;
  /** Trajectory match fraction (matched steps / total expected). */
  trajectoryScore?: number;
  /** Tool call names observed in the agent's event stream, in order. */
  observedTrajectory?: string[];
  /** Wall-clock duration of the case in milliseconds. */
  durationMs: number;
  /** Error message; set when `outcome` is "error". */
  error?: string;
}

export interface EvalReport {
  /** Name of the suite that was run. */
  suite: string;
  /** Endpoint the suite was run against. */
  endpoint: string;
  /** ISO-8601 timestamp taken when the report was assembled. */
  runAt: string;
  results: EvalResult[];
  summary: {
    total: number;
    passed: number;
    failed: number;
    errored: number;
    /** Average judge score across all judged cases (0–1). */
    avgJudgeScore?: number;
    /** Average trajectory score across all trajectory cases (0–1). */
    avgTrajectoryScore?: number;
  };
}

// ── Runner options ────────────────────────────────────────────────────────────

export interface RunEvalsOptions {
  /** Base URL of the agent. The runner POSTs each case to `<endpoint>/v1/run`. */
  endpoint: string;
  /** Bearer token for authenticated agents. */
  bearerToken?: string;
  /**
   * LLM judge configuration. When provided, each case is graded by an LLM
   * that receives (input, output, expected?) and returns a score + reasoning.
   */
  judge?: {
    /** Base URL of any OpenAI-compatible API; `/chat/completions` is appended. */
    apiUrl: string;
    apiKey: string;
    model: string;
  };
  /**
   * Score threshold 0–1 below which a case is marked "fail".
   * Applies to judgeScore when a judge is configured, and to trajectoryScore
   * when the case has an expectedTrajectory. Default: 0.7
   */
  passThreshold?: number;
  /** Maximum concurrent agent calls. Default: 4 */
  concurrency?: number;
  /** Timeout for the agent request in ms. Default: 30000 */
  timeoutMs?: number;
}
|