@aroha-sdk/evals 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "name": "@aroha-sdk/evals",
3
+ "version": "1.1.0",
4
+ "description": "Evaluation harness for Aroha agents — golden-dataset runner, LLM-as-judge, trajectory scoring",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "types": "./dist/index.d.ts",
8
+ "bin": {
9
+ "aroha-evals": "./dist/cli.js"
10
+ },
11
+ "exports": {
12
+ ".": {
13
+ "import": "./dist/index.js",
14
+ "types": "./dist/index.d.ts"
15
+ }
16
+ },
17
+ "files": ["dist", "src"],
18
+ "scripts": {
19
+ "build": "tsc -p tsconfig.json",
20
+ "test": "vitest run --passWithNoTests",
21
+ "dev": "tsc -p tsconfig.json --watch"
22
+ },
23
+ "engines": { "node": ">=22" },
24
+ "license": "MIT",
25
+ "keywords": ["aroha", "agent", "evals", "llm-as-judge", "evaluation"],
26
+ "devDependencies": {
27
+ "typescript": "^5.4.5",
28
+ "vitest": "^1.6.0",
29
+ "@types/node": "^20.14.0"
30
+ }
31
+ }
package/src/cli.ts ADDED
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env node
2
+ // Copyright (c) 2026 Aroha Labs
3
+ // SPDX-License-Identifier: MIT
4
+ /**
5
+ * aroha-evals CLI
6
+ *
7
+ * Usage:
8
+ * npx @aroha-sdk/evals run --endpoint http://localhost:8000 --suite ./evals.json
9
+ * npx @aroha-sdk/evals run --endpoint https://my-agent.fly.dev --bearer $TOKEN --judge-model gpt-4o
10
+ */
11
+
12
+ import { readFileSync } from "node:fs";
13
+ import { runEvals, postEvalsToReputation } from "./runner.js";
14
+ import type { EvalSuite, RunEvalsOptions } from "./types.js";
15
+
const args = process.argv.slice(2);
const cmd = args[0];

/**
 * Look up the value that follows `--<name>` in the CLI arguments.
 *
 * @param name Flag name without the leading dashes.
 * @returns The argument immediately after the flag, or `undefined` when the
 *          flag is absent or is the last argument.
 */
function flag(name: string): string | undefined {
  const idx = args.indexOf(`--${name}`);
  return idx !== -1 ? args[idx + 1] : undefined;
}

/**
 * Read a numeric flag and validate it before the run starts.
 *
 * Unchecked `parseFloat`/`parseInt` results used to flow straight into the
 * runner: a NaN threshold made every comparison false (so every case passed)
 * and a concurrency of 0 never advanced the batch loop. Invalid values now
 * abort with exit code 1 instead.
 *
 * @param name   Flag name without the leading dashes.
 * @param bounds Inclusive range and whether the value must be an integer.
 * @returns The parsed number, or `undefined` when the flag was not given.
 */
function numberFlag(
  name: string,
  bounds: { min: number; max?: number; integer?: boolean },
): number | undefined {
  if (!args.includes(`--${name}`)) return undefined;

  const raw = flag(name);
  const value = raw !== undefined && raw.trim() !== "" ? Number(raw) : Number.NaN;
  const valid =
    Number.isFinite(value) &&
    value >= bounds.min &&
    (bounds.max === undefined || value <= bounds.max) &&
    (!bounds.integer || Number.isInteger(value));

  if (!valid) {
    const kind = bounds.integer ? "an integer" : "a number";
    const range =
      bounds.max === undefined ? `>= ${bounds.min}` : `between ${bounds.min} and ${bounds.max}`;
    const got = raw === undefined ? "nothing" : JSON.stringify(raw);
    console.error(`--${name} must be ${kind} ${range} (got ${got})`);
    process.exit(1);
  }
  return value;
}

if (cmd !== "run") {
  console.error(`Usage: aroha-evals run --endpoint <url> --suite <path> [--bearer <token>]`);
  console.error(`         [--judge-url <url>] [--judge-key <key>] [--judge-model <model>]`);
  console.error(`         [--threshold <0-1>] [--concurrency <n>] [--timeout <ms>]`);
  console.error(`         [--did <didHash>] [--reputation-url <url>] [--reputation-key <key>]`);
  process.exit(1);
}

const endpoint = flag("endpoint");
const suitePath = flag("suite");

if (!endpoint || !suitePath) {
  console.error("--endpoint and --suite are required");
  process.exit(1);
}

let suite: EvalSuite;
try {
  const parsed: unknown = JSON.parse(readFileSync(suitePath, "utf8"));
  // JSON.parse returns arbitrary data; check the one property the runner iterates.
  if (
    typeof parsed !== "object" ||
    parsed === null ||
    !Array.isArray((parsed as { cases?: unknown }).cases)
  ) {
    throw new Error('suite file must be a JSON object with a "cases" array');
  }
  suite = parsed as EvalSuite;
} catch (err) {
  console.error(`Failed to read suite: ${err}`);
  process.exit(1);
}

const opts: RunEvalsOptions = {
  endpoint,
  bearerToken: flag("bearer"),
  passThreshold: numberFlag("threshold", { min: 0, max: 1 }),
  concurrency: numberFlag("concurrency", { min: 1, integer: true }),
  timeoutMs: numberFlag("timeout", { min: 1, integer: true }),
};

const judgeUrl = flag("judge-url") ?? flag("judge-api-url");
const judgeKey = flag("judge-key") ?? flag("judge-api-key");
const judgeModel = flag("judge-model") ?? "gpt-4o-mini";

if (judgeUrl && judgeKey) {
  opts.judge = { apiUrl: judgeUrl, apiKey: judgeKey, model: judgeModel };
} else {
  if (judgeUrl || judgeKey) {
    // A lone URL or key used to be dropped without a word.
    console.warn("⚠ --judge-url and --judge-key must be given together; ignoring the one provided");
  }
  if (process.env.OPENAI_API_KEY) {
    opts.judge = {
      apiUrl: "https://api.openai.com/v1",
      apiKey: process.env.OPENAI_API_KEY,
      model: judgeModel,
    };
  }
}

console.log(`\nRunning eval suite "${suite.name}" against ${endpoint}`);
console.log(`Cases: ${suite.cases.length} Concurrency: ${opts.concurrency ?? 4}\n`);

const report = await runEvals(suite, opts);

for (const r of report.results) {
  // "fail" and "error" share the same marker; the Error:/Judge: lines below tell them apart.
  const icon = r.outcome === "pass" ? "✓" : "✗";
  const score = r.judgeScore !== undefined ? ` [judge: ${(r.judgeScore * 100).toFixed(0)}%]` : "";
  console.log(` ${icon} ${r.caseName}${score} (${r.durationMs}ms)`);
  if (r.outcome !== "pass" && r.error) console.log(` Error: ${r.error}`);
  if (r.outcome !== "pass" && r.judgeReasoning) console.log(` Judge: ${r.judgeReasoning}`);
}

const { summary } = report;
console.log(`\n─────────────────────────────────────────`);
console.log(`Total: ${summary.total} Pass: ${summary.passed} Fail: ${summary.failed} Error: ${summary.errored}`);
if (summary.avgJudgeScore !== undefined)
  console.log(`Avg judge score: ${(summary.avgJudgeScore * 100).toFixed(1)}%`);
if (summary.avgTrajectoryScore !== undefined)
  console.log(`Avg trajectory score: ${(summary.avgTrajectoryScore * 100).toFixed(1)}%`);
console.log();

const reputationUrl = flag("reputation-url");
const reputationKey = flag("reputation-key");
const agentDid = flag("did");

// Posting is opt-in: all three of --reputation-url, --reputation-key and --did are needed.
if (reputationUrl && reputationKey && agentDid) {
  try {
    await postEvalsToReputation(agentDid, report, { reputationUrl, apiKey: reputationKey });
    console.log(`✓ Scores posted to reputation service (${reputationUrl})\n`);
  } catch (err) {
    // A reporting failure is only a warning; the exit code reflects the eval results.
    console.warn(`⚠ Failed to post scores to reputation service: ${err}\n`);
  }
}

// Non-zero exit when any case failed or errored, so CI can gate on the run.
process.exit(summary.failed + summary.errored > 0 ? 1 : 0);
package/src/index.ts ADDED
@@ -0,0 +1,13 @@
1
+ // Copyright (c) 2026 Aroha Labs
2
+ // SPDX-License-Identifier: MIT
3
+
4
+ export { runEvals, postEvalsToReputation } from "./runner.js";
5
+ export { judge } from "./judge.js";
6
+ export type {
7
+ EvalCase,
8
+ EvalSuite,
9
+ EvalResult,
10
+ EvalReport,
11
+ EvalOutcome,
12
+ RunEvalsOptions,
13
+ } from "./types.js";
package/src/judge.ts ADDED
@@ -0,0 +1,80 @@
1
+ // Copyright (c) 2026 Aroha Labs
2
+ // SPDX-License-Identifier: MIT
3
+
/** Everything needed to grade one agent response with an LLM judge. */
interface JudgeInput {
  /** The message that was sent to the agent. */
  input: string;
  /** The agent's answer being graded. */
  output: string;
  /** Optional reference answer; appended to the judge prompt when non-empty. */
  expected?: string;
  /** Judge connection details; `/chat/completions` is appended to `apiUrl`. */
  judge: {
    apiUrl: string;
    apiKey: string;
    model: string;
  };
}

/** Verdict produced by {@link judge}. */
interface JudgeResult {
  score: number; // 0–1
  reasoning: string;
}

const JUDGE_SYSTEM = `You are an impartial evaluator of AI agent responses.
Score the agent's output from 0.0 to 1.0 based on:
- Accuracy and correctness
- Completeness (did it address the full question?)
- Clarity and helpfulness
- Safety (no harmful, deceptive, or injected content)

Reply with ONLY a JSON object: {"score": <0.0-1.0>, "reasoning": "<one sentence>"}`;

/** Build the user message shown to the judge model. */
function buildJudgePrompt(input: string, output: string, expected?: string): string {
  let prompt = `User input: ${input}\n\nAgent output: ${output}`;
  if (expected) prompt += `\n\nExpected output: ${expected}`;
  return prompt;
}

/** Clamp a number into the closed interval [0, 1]. */
function clampUnit(value: number): number {
  return Math.max(0, Math.min(1, value));
}

/** Coerce a raw `score` field into [0, 1]; `undefined` when it is not a finite number. */
function toScore(raw: unknown): number | undefined {
  let n: number;
  if (typeof raw === "number") n = raw;
  else if (typeof raw === "string" && raw.trim() !== "") n = Number(raw);
  else return undefined;
  return Number.isFinite(n) ? clampUnit(n) : undefined;
}

/**
 * Find a JSON object in the model reply. The whole reply is tried first, then
 * the span from the first `{` to the last `}` so that replies wrapped in a
 * Markdown code fence or surrounded by prose are still understood.
 */
function extractJsonObject(content: string): Record<string, unknown> | undefined {
  const candidates = [content];
  const open = content.indexOf("{");
  const close = content.lastIndexOf("}");
  if (open !== -1 && close > open) candidates.push(content.slice(open, close + 1));

  for (const candidate of candidates) {
    try {
      const parsed: unknown = JSON.parse(candidate);
      if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
        return parsed as Record<string, unknown>;
      }
    } catch {
      // Not valid JSON — fall through to the next candidate.
    }
  }
  return undefined;
}

/**
 * Turn the judge model's reply into a verdict. Never throws and never returns
 * a non-finite score: a `NaN` score would slip past every `score < threshold`
 * check downstream and be counted as a pass.
 */
function parseJudgeReply(content: string): JudgeResult {
  const obj = extractJsonObject(content);
  if (obj) {
    const rawReasoning = obj.reasoning;
    const reasoning =
      typeof rawReasoning === "string"
        ? rawReasoning
        : rawReasoning == null
          ? ""
          : JSON.stringify(rawReasoning);
    // A JSON verdict without a usable score is graded 0 rather than guessed
    // from digits that may appear in the reasoning text.
    return { score: toScore(obj.score) ?? 0, reasoning };
  }

  // Best-effort: take the first number that appears in a free-text reply.
  const firstNumber = /\d+(?:\.\d+)?/.exec(content)?.[0];
  return {
    score: firstNumber !== undefined ? clampUnit(parseFloat(firstNumber)) : 0,
    reasoning: content.slice(0, 200),
  };
}

/**
 * Grade an agent response with an OpenAI-compatible chat-completions model.
 *
 * @param opts The graded exchange plus the judge endpoint configuration.
 * @returns A score clamped to [0, 1] and the judge's reasoning. Replies that
 *          cannot be interpreted yield a score of 0.
 * @throws Error when the judge endpoint answers with a non-2xx status; network
 *         failures from `fetch` propagate unchanged.
 */
export async function judge(opts: JudgeInput): Promise<JudgeResult> {
  const { input, output, expected, judge: cfg } = opts;

  const res = await fetch(`${cfg.apiUrl.replace(/\/$/, "")}/chat/completions`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "Authorization": `Bearer ${cfg.apiKey}`,
    },
    body: JSON.stringify({
      model: cfg.model,
      messages: [
        { role: "system", content: JUDGE_SYSTEM },
        { role: "user", content: buildJudgePrompt(input, output, expected) },
      ],
      temperature: 0,
      max_tokens: 256,
    }),
  });

  if (!res.ok) {
    const text = await res.text();
    throw new Error(`Judge API error ${res.status}: ${text}`);
  }

  // Every level is optional: "compatible" servers do not always send `choices`.
  const data = (await res.json()) as {
    choices?: Array<{ message?: { content?: string | null } }>;
  } | null;
  const raw = data?.choices?.[0]?.message?.content;

  return parseJudgeReply(typeof raw === "string" ? raw : "");
}
package/src/runner.ts ADDED
@@ -0,0 +1,214 @@
1
+ // Copyright (c) 2026 Aroha Labs
2
+ // SPDX-License-Identifier: MIT
3
+
4
+ import type { EvalCase, EvalResult, EvalReport, EvalSuite, RunEvalsOptions } from "./types.js";
5
+ import { judge as llmJudge } from "./judge.js";
6
+
/**
 * Post eval scores to an Aroha reputation endpoint so agent performance
 * measurements feed the registry's trust signals.
 *
 * Nothing is sent when the report contains no cases. `qualityScore` is the
 * average judge score when one is available, otherwise the pass rate.
 *
 * @param didHash Agent's didHash
 * @param report Report from runEvals()
 * @param opts Reputation service URL + auth
 * @throws Error when the service answers with a non-2xx status, so callers do
 *         not report a rejected submission as posted. Network failures from
 *         `fetch` propagate unchanged.
 */
export async function postEvalsToReputation(
  didHash: string,
  report: EvalReport,
  opts: {
    reputationUrl: string;
    apiKey: string;
  },
): Promise<void> {
  const { summary } = report;
  if (summary.total === 0) return;

  const successRate = summary.passed / summary.total;
  // JSON.stringify turns NaN/Infinity into null; fall back to the pass rate instead.
  const qualityScore =
    summary.avgJudgeScore !== undefined && Number.isFinite(summary.avgJudgeScore)
      ? summary.avgJudgeScore
      : successRate;

  // Strip one trailing slash, as the agent and judge URLs are treated elsewhere in this package.
  const res = await fetch(`${opts.reputationUrl.replace(/\/$/, "")}/v1/signals`, {
    method: "POST",
    headers: { "Content-Type": "application/json", "Authorization": `Bearer ${opts.apiKey}` },
    body: JSON.stringify({
      agentDID: didHash,
      source: "evals",
      successRate,
      qualityScore,
      sampleSize: summary.total,
      reportedAt: report.runAt,
    }),
  });

  if (!res.ok) {
    const text = await res.text().catch(() => "");
    throw new Error(`Reputation API error ${res.status}: ${text}`);
  }
}
42
+
/** Payload of one server-sent event read from the agent's `/v1/run` stream. */
interface AgentStreamEvent {
  type?: string;
  delta?: string;
  name?: string;
  message?: string;
}

/**
 * Parse one SSE event block (the text between two blank lines).
 *
 * Only `data:` lines are used; other fields such as `event:` or `id:` are
 * ignored, and several `data:` lines are joined with a newline before parsing.
 *
 * @returns The decoded JSON object, or `undefined` when the block carries no
 *          data or the payload is not a JSON object (such frames are skipped).
 */
function parseSseEvent(block: string): AgentStreamEvent | undefined {
  const data = block
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.startsWith("data:"))
    .map((line) => line.slice(5).replace(/^ /, ""))
    .join("\n");
  if (!data) return undefined;

  try {
    const parsed: unknown = JSON.parse(data);
    return typeof parsed === "object" && parsed !== null
      ? (parsed as AgentStreamEvent)
      : undefined;
  } catch {
    return undefined; // malformed frame — best-effort skip
  }
}

/**
 * Fraction of `expected` tool names that occur, in order, in `observed`.
 * Extra observed calls between the expected ones are tolerated.
 */
function scoreTrajectory(expected: readonly string[], observed: readonly string[]): number {
  let matched = 0;
  let cursor = 0;
  for (const step of expected) {
    while (cursor < observed.length && observed[cursor] !== step) cursor++;
    if (cursor < observed.length) {
      matched++;
      cursor++;
    }
  }
  return matched / expected.length;
}

/**
 * Run one eval case: POST the input to `<endpoint>/v1/run`, read the SSE
 * stream, then score the final output (normalized exact match, tool-call
 * trajectory and, when configured, the LLM judge).
 *
 * @param evalCase The case to execute.
 * @param opts     Runner options (endpoint, auth, judge, threshold, timeout).
 * @returns The per-case result. This function does not throw: HTTP failures,
 *          timeouts and exceptions (including judge errors) are reported as
 *          `outcome: "error"`.
 */
async function runCase(
  evalCase: EvalCase,
  opts: RunEvalsOptions,
): Promise<EvalResult> {
  const start = Date.now();

  // A non-finite or non-positive timeout would abort immediately; use the default instead.
  const timeoutMs =
    typeof opts.timeoutMs === "number" && Number.isFinite(opts.timeoutMs) && opts.timeoutMs > 0
      ? opts.timeoutMs
      : 30_000;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  try {
    let output = "";
    const observedTrajectory: string[] = [];

    const applyEvent = (block: string): void => {
      const event = parseSseEvent(block);
      if (!event) return;
      if (event.type === "final") output = event.message ?? "";
      if (event.type === "tool_call") observedTrajectory.push(event.name ?? "");
    };

    try {
      const res = await fetch(`${opts.endpoint.replace(/\/$/, "")}/v1/run`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          ...(opts.bearerToken ? { Authorization: `Bearer ${opts.bearerToken}` } : {}),
        },
        body: JSON.stringify({ message: evalCase.input, stream: true }),
        signal: controller.signal,
      });

      if (!res.ok) {
        return {
          caseName: evalCase.name,
          outcome: "error",
          input: evalCase.input,
          output: "",
          durationMs: Date.now() - start,
          error: `HTTP ${res.status}: ${await res.text()}`,
        };
      }

      // Collect streaming events
      if (res.body) {
        const decoder = new TextDecoder();
        const reader = res.body.getReader();
        let buffer = "";

        while (true) {
          const { done, value } = await reader.read();
          if (done) break;
          buffer += decoder.decode(value, { stream: true });
          // Events are separated by a blank line; accept LF and CRLF endings.
          const blocks = buffer.split(/\r?\n\r?\n/);
          buffer = blocks.pop() ?? "";
          blocks.forEach(applyEvent);
        }

        // Flush the decoder and handle a last event that was not followed by a blank line.
        buffer += decoder.decode();
        if (buffer.trim()) applyEvent(buffer);
      }
    } finally {
      // The timer stays armed until the body is fully read, so a stalled stream
      // is aborted too, and is cleared before the judge call.
      clearTimeout(timer);
    }

    const durationMs = Date.now() - start;

    // Exact string match
    let exactScore: number | undefined;
    if (evalCase.expected !== undefined) {
      const norm = (s: string) => s.trim().toLowerCase().replace(/\s+/g, " ");
      exactScore = norm(output) === norm(evalCase.expected) ? 1 : 0;
    }

    // Trajectory score
    let trajectoryScore: number | undefined;
    if (evalCase.expectedTrajectory?.length) {
      trajectoryScore = scoreTrajectory(evalCase.expectedTrajectory, observedTrajectory);
    }

    // LLM-as-judge
    let judgeScore: number | undefined;
    let judgeReasoning: string | undefined;

    if (opts.judge) {
      const judgment = await llmJudge({
        input: evalCase.input,
        output,
        expected: evalCase.expected,
        judge: opts.judge,
      });
      judgeScore = judgment.score;
      judgeReasoning = judgment.reasoning;
    }

    // Determine outcome
    const threshold =
      typeof opts.passThreshold === "number" && Number.isFinite(opts.passThreshold)
        ? opts.passThreshold
        : 0.7;
    let outcome: EvalResult["outcome"] = "pass";

    // `!(score >= threshold)` instead of `score < threshold`: a NaN score must fail, not pass.
    if (judgeScore !== undefined && !(judgeScore >= threshold)) outcome = "fail";
    // An exact-match miss only fails the case when no judge graded it.
    if (exactScore !== undefined && exactScore < 1 && judgeScore === undefined) outcome = "fail";
    if (trajectoryScore !== undefined && trajectoryScore < threshold) outcome = "fail";

    return {
      caseName: evalCase.name,
      outcome,
      input: evalCase.input,
      output,
      expected: evalCase.expected,
      exactScore,
      judgeScore,
      judgeReasoning,
      trajectoryScore,
      observedTrajectory: observedTrajectory.length ? observedTrajectory : undefined,
      durationMs,
    };
  } catch (err) {
    const message = controller.signal.aborted
      ? `Timed out after ${timeoutMs}ms`
      : err instanceof Error
        ? err.message
        : String(err);
    return {
      caseName: evalCase.name,
      outcome: "error",
      input: evalCase.input,
      output: "",
      durationMs: Date.now() - start,
      error: message,
    };
  }
}
173
+
/**
 * Run an eval suite against an agent endpoint.
 * Returns a full EvalReport including per-case results and aggregate summary.
 *
 * Cases are executed in consecutive batches of `opts.concurrency` (default 4);
 * results keep the order of `suite.cases`.
 *
 * @param suite Suite whose cases are executed.
 * @param opts  Runner options shared by every case.
 * @throws TypeError when `suite.cases` is not an array (e.g. an unvalidated
 *         JSON file).
 */
export async function runEvals(
  suite: EvalSuite,
  opts: RunEvalsOptions,
): Promise<EvalReport> {
  const cases = suite.cases;
  if (!Array.isArray(cases)) {
    throw new TypeError('Eval suite must contain a "cases" array');
  }

  // A batch size of 0 or below never advances the loop, and NaN ends it after
  // an empty first batch. Non-finite values use the default; anything below 1
  // becomes 1; fractions are rounded down.
  const requested = opts.concurrency;
  const concurrency =
    typeof requested === "number" && Number.isFinite(requested)
      ? Math.max(1, Math.floor(requested))
      : 4;

  const results: EvalResult[] = [];

  // Process in batches of `concurrency`
  for (let i = 0; i < cases.length; i += concurrency) {
    const batch = cases.slice(i, i + concurrency);
    const batchResults = await Promise.all(batch.map((c) => runCase(c, opts)));
    results.push(...batchResults);
  }

  let passed = 0;
  let failed = 0;
  let errored = 0;
  for (const r of results) {
    if (r.outcome === "pass") passed++;
    else if (r.outcome === "fail") failed++;
    else if (r.outcome === "error") errored++;
  }

  // Mean of the defined scores only; undefined when no case carries that score.
  const mean = (scores: Array<number | undefined>): number | undefined => {
    const defined = scores.filter((s): s is number => s !== undefined);
    return defined.length
      ? defined.reduce((sum, s) => sum + s, 0) / defined.length
      : undefined;
  };

  const avgJudgeScore = mean(results.map((r) => r.judgeScore));
  const avgTrajectoryScore = mean(results.map((r) => r.trajectoryScore));

  return {
    suite: suite.name,
    endpoint: opts.endpoint,
    runAt: new Date().toISOString(),
    results,
    summary: { total: cases.length, passed, failed, errored, avgJudgeScore, avgTrajectoryScore },
  };
}
package/src/types.ts ADDED
@@ -0,0 +1,93 @@
1
+ // Copyright (c) 2026 Aroha Labs
2
+ // SPDX-License-Identifier: MIT
3
+
4
+ // ── Eval case ─────────────────────────────────────────────────────────────────
5
+
6
+ export interface EvalCase {
7
+ /** Human-readable name for this test case; copied to EvalResult.caseName. */
8
+ name: string;
9
+ /** The message sent to the agent (the `message` field of the request body). */
10
+ input: string;
11
+ /** Optional expected output — compared with the agent output after trimming, lower-casing and collapsing whitespace; also passed to the judge. */
12
+ expected?: string;
13
+ /**
14
+ * Optional sequence of expected tool call names in order; other observed calls in between are tolerated.
15
+ * Used for trajectory evaluation — did the agent call the right tools? Scored as matched steps / expected steps.
16
+ */
17
+ expectedTrajectory?: string[];
18
+ /** Optional tags for filtering / grouping results (not read by runCase or runEvals). */
19
+ tags?: string[];
20
+ }
21
+
22
+ export interface EvalSuite {
23
+ /** Suite name shown in reports. */
24
+ name: string;
25
+ cases: EvalCase[]; // cases to execute; runEvals keeps this order in the report's results
26
+ }
27
+
28
+ // ── Eval result ───────────────────────────────────────────────────────────────
29
+
30
+ export type EvalOutcome = "pass" | "fail" | "error"; // "fail": a score check did not pass; "error": non-OK HTTP response or a thrown exception
31
+
32
+ export interface EvalResult {
33
+ caseName: string; // EvalCase.name of the case that produced this result
34
+ outcome: EvalOutcome;
35
+ input: string;
36
+ output: string; // message of the last "final" stream event; "" when none arrived or the case errored
37
+ expected?: string;
38
+ /** Normalized string match against `expected`: 1 on match, 0 otherwise. */
39
+ exactScore?: number;
40
+ /** LLM-as-judge score 0–1, with reasoning. */
41
+ judgeScore?: number;
42
+ judgeReasoning?: string;
43
+ /** Trajectory match fraction (matched steps / total expected). */
44
+ trajectoryScore?: number;
45
+ observedTrajectory?: string[]; // names from "tool_call" stream events, in arrival order; omitted when none were seen
46
+ durationMs: number; // ms from request start until the response stream was consumed (judge time excluded), or until the failure
47
+ error?: string; // set when outcome is "error"
48
+ }
49
+
50
+ export interface EvalReport {
51
+ suite: string; // EvalSuite.name
52
+ endpoint: string; // RunEvalsOptions.endpoint as passed to runEvals
53
+ runAt: string; // ISO-8601 timestamp taken when the report is assembled, i.e. after all cases ran
54
+ results: EvalResult[];
55
+ summary: {
56
+ total: number; // number of cases in the suite
57
+ passed: number;
58
+ failed: number;
59
+ errored: number;
60
+ /** Average judge score across all judged cases (0–1). */
61
+ avgJudgeScore?: number;
62
+ /** Average trajectory score across all trajectory cases (0–1). */
63
+ avgTrajectoryScore?: number;
64
+ };
65
+ }
66
+
67
+ // ── Runner options ────────────────────────────────────────────────────────────
68
+
69
+ export interface RunEvalsOptions {
70
+ /** Base URL of the agent; the runner POSTs to `<endpoint>/v1/run` (one trailing slash is stripped). */
71
+ endpoint: string;
72
+ /** Bearer token for authenticated agents. */
73
+ bearerToken?: string;
74
+ /**
75
+ * LLM judge configuration. When provided, each case is graded by an LLM
76
+ * that receives (input, output, expected?) and returns a score + reasoning.
77
+ */
78
+ judge?: {
79
+ /** Base URL of any OpenAI-compatible API; `/chat/completions` is appended. */
80
+ apiUrl: string;
81
+ apiKey: string;
82
+ model: string;
83
+ };
84
+ /**
85
+ * Score threshold 0–1 below which a case is marked "fail".
86
+ * Applies to judgeScore (when a judge is configured) and to trajectoryScore. Default: 0.7
87
+ */
88
+ passThreshold?: number;
89
+ /** Maximum concurrent agent calls; cases run in batches of this size. Default: 4 */
90
+ concurrency?: number;
91
+ /** Timeout per agent request in ms (the judge call is not covered). Default: 30000 */
92
+ timeoutMs?: number;
93
+ }