@lobu/cli 3.0.7 → 3.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/dist/api/credentials.d.ts +2 -1
  2. package/dist/api/credentials.d.ts.map +1 -1
  3. package/dist/api/credentials.js +30 -4
  4. package/dist/api/credentials.js.map +1 -1
  5. package/dist/commands/chat.d.ts +1 -0
  6. package/dist/commands/chat.d.ts.map +1 -1
  7. package/dist/commands/chat.js +65 -5
  8. package/dist/commands/chat.js.map +1 -1
  9. package/dist/commands/eval.d.ts +10 -0
  10. package/dist/commands/eval.d.ts.map +1 -0
  11. package/dist/commands/eval.js +194 -0
  12. package/dist/commands/eval.js.map +1 -0
  13. package/dist/commands/init.d.ts.map +1 -1
  14. package/dist/commands/init.js +54 -2
  15. package/dist/commands/init.js.map +1 -1
  16. package/dist/commands/login.d.ts +1 -0
  17. package/dist/commands/login.d.ts.map +1 -1
  18. package/dist/commands/login.js +51 -50
  19. package/dist/commands/login.js.map +1 -1
  20. package/dist/commands/logout.d.ts.map +1 -1
  21. package/dist/commands/logout.js +15 -1
  22. package/dist/commands/logout.js.map +1 -1
  23. package/dist/eval/client.d.ts +42 -0
  24. package/dist/eval/client.d.ts.map +1 -0
  25. package/dist/eval/client.js +166 -0
  26. package/dist/eval/client.js.map +1 -0
  27. package/dist/eval/grader.d.ts +14 -0
  28. package/dist/eval/grader.d.ts.map +1 -0
  29. package/dist/eval/grader.js +177 -0
  30. package/dist/eval/grader.js.map +1 -0
  31. package/dist/eval/reporter.d.ts +8 -0
  32. package/dist/eval/reporter.d.ts.map +1 -0
  33. package/dist/eval/reporter.js +242 -0
  34. package/dist/eval/reporter.js.map +1 -0
  35. package/dist/eval/runner.d.ts +11 -0
  36. package/dist/eval/runner.d.ts.map +1 -0
  37. package/dist/eval/runner.js +172 -0
  38. package/dist/eval/runner.js.map +1 -0
  39. package/dist/eval/types.d.ts +243 -0
  40. package/dist/eval/types.d.ts.map +1 -0
  41. package/dist/eval/types.js +31 -0
  42. package/dist/eval/types.js.map +1 -0
  43. package/dist/index.d.ts.map +1 -1
  44. package/dist/index.js +17 -0
  45. package/dist/index.js.map +1 -1
  46. package/dist/templates/.gitignore.tmpl +4 -0
  47. package/package.json +2 -1
@@ -0,0 +1,177 @@
1
+ /**
2
+ * LLM-as-judge grader using the Lobu gateway.
3
+ *
4
+ * Borrows the Claude outcomes pattern: a separate evaluator context
5
+ * grades agent output against a markdown rubric with per-criterion scoring.
6
+ */
7
+ import { createSession, deleteSession, sendAndCollect, } from "./client.js";
8
// Judge prompt for grading a single response against one criteria string.
// The {{criteria}} and {{response}} placeholders are filled in by gradeInline.
const INLINE_JUDGE_PROMPT = [
    "You are a strict evaluator. You will be given an AI agent's response and a criteria to judge it against.",
    "",
    "You MUST respond with ONLY a JSON object, no other text:",
    '{"passed": true, "score": 0.85, "reason": "one sentence explanation"}',
    "",
    "Rules:",
    '- "passed": true if the response meets the criteria, false otherwise',
    '- "score": a number between 0.0 and 1.0',
    '- "reason": a brief explanation (one sentence)',
    "- Return ONLY the JSON object, nothing else",
    "",
    "## Criteria",
    "{{criteria}}",
    "",
    "## Agent Response",
    "{{response}}",
].join("\n");
24
// Judge prompt for grading a whole conversation against a multi-criterion rubric.
// The {{rubric}} and {{transcript}} placeholders are filled in by gradeWithRubric.
const RUBRIC_JUDGE_PROMPT = [
    "You are a strict evaluator. Grade the agent's conversation against each criterion in the rubric.",
    "",
    "You MUST respond with ONLY a JSON object, no other text:",
    '{"criteria": [{"name": "criterion name", "passed": true, "explanation": "why"}], "score": 0.85}',
    "",
    "Rules:",
    "- Score each criterion independently",
    '- "score" is the overall score 0.0-1.0',
    "- Return ONLY the JSON object, nothing else",
    "",
    "## Rubric",
    "{{rubric}}",
    "",
    "## Conversation",
    "{{transcript}}",
].join("\n");
39
/**
 * Grade a conversation against a rubric using an LLM judge.
 *
 * Creates a fresh dry-run session, sends the filled-in rubric prompt, parses
 * the judge's reply, and deletes the session whether or not grading succeeded.
 *
 * @param gatewayUrl - forwarded to createSession.
 * @param authToken - forwarded to createSession.
 * @param rubricContent - rubric text inserted verbatim into the prompt.
 * @param turns - conversation turns; each item's `user` and `agent` fields are
 *   rendered into the transcript.
 * @param timeoutMs - forwarded to sendAndCollect.
 * @returns the parsed grading result (`score` plus per-criterion `criteria`).
 */
export async function gradeWithRubric(gatewayUrl, authToken, rubricContent, turns, timeoutMs) {
    const transcript = turns
        .map((t) => `User: ${t.user}\nAgent: ${t.agent}`)
        .join("\n\n");
    // Fill both placeholders in a single pass with a function replacer:
    //  - a function's return value is inserted verbatim, so "$&", "$'", "$`"
    //    or "$$" inside the rubric/transcript are not treated as replacement
    //    patterns (a plain string replacement would expand them);
    //  - a single pass means a literal "{{transcript}}" inside the rubric is
    //    never re-expanded.
    const substitutions = { rubric: rubricContent, transcript };
    const prompt = RUBRIC_JUDGE_PROMPT.replace(/\{\{(rubric|transcript)\}\}/g, (_match, key) => substitutions[key]);
    const session = await createSession(gatewayUrl, authToken, {
        forceNew: true,
        dryRun: true,
    });
    try {
        const response = await sendAndCollect(session, prompt, timeoutMs);
        return parseGraderResponse(response);
    }
    finally {
        // Always release the judge session, even when sending or parsing throws.
        await deleteSession(session);
    }
}
56
/**
 * Grade a single agent response against one criteria string using an LLM judge.
 *
 * Creates a fresh dry-run session, sends the filled-in inline prompt, parses
 * the judge's reply, and deletes the session whether or not grading succeeded.
 *
 * @param gatewayUrl - forwarded to createSession.
 * @param authToken - forwarded to createSession.
 * @param criteria - criteria text inserted verbatim into the prompt.
 * @param agentResponse - agent output inserted verbatim into the prompt.
 * @param timeoutMs - forwarded to sendAndCollect.
 * @returns the parsed verdict (`passed`, `score`, `reason`).
 */
export async function gradeInline(gatewayUrl, authToken, criteria, agentResponse, timeoutMs) {
    // Fill both placeholders in a single pass with a function replacer:
    //  - a function's return value is inserted verbatim, so "$&", "$'", "$`"
    //    or "$$" in the criteria/response (common in shell or code output) are
    //    not treated as replacement patterns;
    //  - a single pass means a literal "{{response}}" inside the criteria is
    //    never re-expanded.
    const substitutions = { criteria, response: agentResponse };
    const prompt = INLINE_JUDGE_PROMPT.replace(/\{\{(criteria|response)\}\}/g, (_match, key) => substitutions[key]);
    const session = await createSession(gatewayUrl, authToken, {
        forceNew: true,
        dryRun: true,
    });
    try {
        const response = await sendAndCollect(session, prompt, timeoutMs);
        return parseInlineResponse(response);
    }
    finally {
        // Always release the judge session, even when sending or parsing throws.
        await deleteSession(session);
    }
}
70
/**
 * Convert the judge's reply to a rubric prompt into `{ score, criteria }`.
 *
 * - A reply carrying `error` yields score 0 with one failed "error" criterion.
 * - Otherwise JSON is extracted and parsed; if that throws, the result is
 *   inferred from the prose via inferRubricFromText.
 *
 * @param response - judge reply; reads `response.error` and `response.text`.
 * @returns `{ score: number, criteria: Array<{ name, passed, explanation }> }`
 */
function parseGraderResponse(response) {
    if (response.error) {
        return {
            score: 0,
            criteria: [{ name: "error", passed: false, explanation: response.error }],
        };
    }
    // Judges sometimes emit verdicts as strings; Boolean("false") is true, so
    // clearly negative strings must be mapped to false explicitly.
    const negativeWords = ["false", "no", "0", "fail", "failed"];
    const toVerdict = (value) => {
        if (typeof value !== "string") {
            return Boolean(value);
        }
        const word = value.trim().toLowerCase();
        return word !== "" && !negativeWords.includes(word);
    };
    try {
        const json = extractJSON(response.text);
        const parsed = JSON.parse(json);
        return {
            // The prompt asks for 0.0-1.0; clamp so an out-of-range reply
            // (e.g. 85) cannot skew averaged scores.
            score: typeof parsed.score === "number"
                ? Math.min(1, Math.max(0, parsed.score))
                : 0,
            criteria: Array.isArray(parsed.criteria)
                ? parsed.criteria.map((c) => ({
                    name: String(c.name ?? ""),
                    passed: toVerdict(c.passed),
                    explanation: String(c.explanation ?? ""),
                }))
                : [],
        };
    }
    catch {
        // Fallback: try to infer from prose response
        return inferRubricFromText(response.text);
    }
}
96
/**
 * Convert the judge's reply to an inline prompt into `{ passed, score, reason }`.
 *
 * - A reply carrying `error` yields a failed verdict with that error as reason.
 * - Otherwise JSON is extracted and parsed; if that throws, the verdict is
 *   inferred from the prose via inferInlineFromText.
 *
 * @param response - judge reply; reads `response.error` and `response.text`.
 * @returns `{ passed: boolean, score: number, reason: string }`
 */
function parseInlineResponse(response) {
    if (response.error) {
        return { passed: false, score: 0, reason: response.error };
    }
    // Judges sometimes emit the verdict as a string; Boolean("false") is true,
    // so clearly negative strings must be mapped to false explicitly.
    const negativeWords = ["false", "no", "0", "fail", "failed"];
    const toVerdict = (value) => {
        if (typeof value !== "string") {
            return Boolean(value);
        }
        const word = value.trim().toLowerCase();
        return word !== "" && !negativeWords.includes(word);
    };
    try {
        const json = extractJSON(response.text);
        const parsed = JSON.parse(json);
        const passed = toVerdict(parsed.passed);
        // Numeric scores are clamped to the 0.0-1.0 range the prompt asks for;
        // a missing score falls back to 1 or 0 according to the verdict.
        let score;
        if (typeof parsed.score === "number") {
            score = Math.min(1, Math.max(0, parsed.score));
        }
        else {
            score = passed ? 1 : 0;
        }
        return {
            passed,
            score,
            reason: String(parsed.reason ?? ""),
        };
    }
    catch {
        // Fallback: infer pass/fail from prose
        return inferInlineFromText(response.text);
    }
}
114
/**
 * Extract JSON from text that may contain markdown fences or surrounding prose.
 *
 * Candidates are tried in order and the first one that actually parses wins:
 *   1. the body of the first markdown code fence;
 *   2. each balanced `{ ... }` span, scanning left to right.
 * If nothing parses, the legacy best-effort result is returned (fence body,
 * then greedy first-`{`-to-last-`}` span, then the trimmed text) so the caller's
 * JSON.parse fails and its prose fallback kicks in.
 *
 * @param text - raw judge output.
 * @returns the most likely JSON substring (not guaranteed to be valid JSON).
 */
function extractJSON(text) {
    const parses = (candidate) => {
        try {
            JSON.parse(candidate);
            return true;
        }
        catch {
            return false;
        }
    };
    // Try to find JSON in markdown code block
    const fenced = text.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
    const fencedBody = fenced?.[1] ? fenced[1].trim() : "";
    if (fencedBody && parses(fencedBody)) {
        return fencedBody;
    }
    // Try each balanced object; a greedy regex would swallow trailing prose
    // that happens to contain braces and produce unparseable output.
    for (let start = text.indexOf("{"); start !== -1; start = text.indexOf("{", start + 1)) {
        const end = findBalancedObjectEnd(text, start);
        if (end === -1) {
            continue;
        }
        const candidate = text.slice(start, end + 1);
        if (parses(candidate)) {
            return candidate;
        }
    }
    // Nothing parsed: keep the original best-effort behaviour.
    if (fenced?.[1]) {
        return fenced[1].trim();
    }
    const braceMatch = text.match(/\{[\s\S]*\}/);
    if (braceMatch) {
        return braceMatch[0];
    }
    return text.trim();
}
/**
 * Return the index of the `}` that closes the `{` at `start`, ignoring braces
 * inside double-quoted strings, or -1 when the object is never closed.
 */
function findBalancedObjectEnd(text, start) {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
        const ch = text[i];
        if (inString) {
            if (escaped) {
                escaped = false;
            }
            else if (ch === "\\") {
                escaped = true;
            }
            else if (ch === '"') {
                inString = false;
            }
            continue;
        }
        if (ch === '"') {
            inString = true;
        }
        else if (ch === "{") {
            depth++;
        }
        else if (ch === "}") {
            depth--;
            if (depth === 0) {
                return i;
            }
        }
    }
    return -1;
}
126
/**
 * Fallback: infer pass/fail from prose when JSON parsing fails.
 *
 * Counts how many positive and negative signal words occur in the text and
 * passes only when positives strictly outnumber negatives. Signals must start
 * at a word boundary, so "incorrect"/"inappropriate" no longer count as the
 * positive "correct"/"appropriate", and "know" no longer counts as "no".
 * Suffixes still match ("passes", "failed", "not").
 *
 * @param text - raw judge output; a nullish value is treated as empty.
 * @returns `{ passed, score, reason }` with score 0.7 on pass and 0.3 on fail.
 */
function inferInlineFromText(text) {
    // The caller may reach this fallback because `text` itself was missing.
    const prose = String(text ?? "");
    const lower = prose.toLowerCase();
    const positiveSignals = [
        "yes",
        "pass",
        "meets",
        "satisfies",
        "correct",
        "appropriate",
        "good",
        "well",
    ];
    const negativeSignals = [
        "no",
        // Listed explicitly: it used to be caught only because it contains "no".
        "cannot",
        "fail",
        "does not",
        "doesn't",
        "incorrect",
        "missing",
        "lacks",
        "poor",
    ];
    const countSignals = (signals) => signals.filter((s) => new RegExp(`\\b${s}`).test(lower)).length;
    const posCount = countSignals(positiveSignals);
    const negCount = countSignals(negativeSignals);
    const passed = posCount > negCount;
    return {
        passed,
        score: passed ? 0.7 : 0.3,
        reason: `Inferred from prose (pos=${posCount}, neg=${negCount}): ${prose.slice(0, 100)}`,
    };
}
158
/**
 * Fallback: infer rubric result from prose when JSON parsing fails.
 *
 * Counts positive and negative signal words and reports a single "overall"
 * criterion that passes only when positives strictly outnumber negatives.
 * Signals must start at a word boundary, so "incorrect" is no longer counted
 * as the positive "correct" (which made "The answer is incorrect." a pass).
 *
 * @param text - raw judge output; a nullish value is treated as empty.
 * @returns `{ score, criteria }` with score 0.7 on pass and 0.3 on fail.
 */
function inferRubricFromText(text) {
    // The caller may reach this fallback because `text` itself was missing.
    const prose = String(text ?? "");
    const lower = prose.toLowerCase();
    const positiveSignals = ["pass", "meets", "satisfies", "good", "correct"];
    const negativeSignals = ["fail", "does not", "doesn't", "missing", "poor"];
    const countSignals = (signals) => signals.filter((s) => new RegExp(`\\b${s}`).test(lower)).length;
    const posCount = countSignals(positiveSignals);
    const negCount = countSignals(negativeSignals);
    const passed = posCount > negCount;
    return {
        score: passed ? 0.7 : 0.3,
        criteria: [
            {
                name: "overall",
                passed,
                explanation: `Inferred from prose: ${prose.slice(0, 200)}`,
            },
        ],
    };
}
177
+ //# sourceMappingURL=grader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"grader.js","sourceRoot":"","sources":["../../src/eval/grader.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EACL,aAAa,EACb,aAAa,EACb,cAAc,GAEf,MAAM,aAAa,CAAC;AAGrB,MAAM,mBAAmB,GAAG;;;;;;;;;;;;;;;aAef,CAAC;AAEd,MAAM,mBAAmB,GAAG;;;;;;;;;;;;;;eAcb,CAAC;AAEhB,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAkB,EAClB,SAAiB,EACjB,aAAqB,EACrB,KAAmB,EACnB,SAAiB;IAEjB,MAAM,UAAU,GAAG,KAAK;SACrB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,CAAC,IAAI,YAAY,CAAC,CAAC,KAAK,EAAE,CAAC;SAChD,IAAI,CAAC,MAAM,CAAC,CAAC;IAEhB,MAAM,MAAM,GAAG,mBAAmB,CAAC,OAAO,CACxC,YAAY,EACZ,aAAa,CACd,CAAC,OAAO,CAAC,gBAAgB,EAAE,UAAU,CAAC,CAAC;IAExC,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC,UAAU,EAAE,SAAS,EAAE;QACzD,QAAQ,EAAE,IAAI;QACd,MAAM,EAAE,IAAI;KACb,CAAC,CAAC;IAEH,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;QAClE,OAAO,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC;YAAS,CAAC;QACT,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;IAC/B,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,UAAkB,EAClB,SAAiB,EACjB,QAAgB,EAChB,aAAqB,EACrB,SAAiB;IAEjB,MAAM,MAAM,GAAG,mBAAmB,CAAC,OAAO,CAAC,cAAc,EAAE,QAAQ,CAAC,CAAC,OAAO,CAC1E,cAAc,EACd,aAAa,CACd,CAAC;IAEF,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC,UAAU,EAAE,SAAS,EAAE;QACzD,QAAQ,EAAE,IAAI;QACd,MAAM,EAAE,IAAI;KACb,CAAC,CAAC;IAEH,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;QAClE,OAAO,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC;YAAS,CAAC;QACT,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;IAC/B,CAAC;AACH,CAAC;AAED,SAAS,mBAAmB,CAAC,QAA2B;IACtD,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QACnB,OAAO;YACL,KAAK,EAAE,CAAC;YACR,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,WAAW,EAAE,QAAQ,CAAC,KAAK,EAAE,CAAC;SAC1E,CAAC;IACJ,CAAC;IAED,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACxC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAG7B,CAAC;QACF,OAAO;YACL,KAAK,EAAE,OAAO,MAAM,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YAC1D,QAAQ,EAAE,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC;gBACtC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC
,EAAE,EAAE,CAAC,CAAC;oBAC1B,IAAI,EAAE,MAAM,CAAC,CAAC,CAAC,IAAI,IAAI,EAAE,CAAC;oBAC1B,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC;oBACzB,WAAW,EAAE,MAAM,CAAC,CAAC,CAAC,WAAW,IAAI,EAAE,CAAC;iBACzC,CAAC,CAAC;gBACL,CAAC,CAAC,EAAE;SACP,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,6CAA6C;QAC7C,OAAO,mBAAmB,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC5C,CAAC;AACH,CAAC;AAED,SAAS,mBAAmB,CAAC,QAA2B;IAKtD,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QACnB,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,QAAQ,CAAC,KAAK,EAAE,CAAC;IAC7D,CAAC;IAED,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACxC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAI7B,CAAC;QACF,OAAO;YACL,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC;YAC9B,KAAK,EACH,OAAO,MAAM,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACzE,MAAM,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;SACpC,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,uCAAuC;QACvC,OAAO,mBAAmB,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC5C,CAAC;AACH,CAAC;AAED,oFAAoF;AACpF,SAAS,WAAW,CAAC,IAAY;IAC/B,0CAA0C;IAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,oCAAoC,CAAC,CAAC;IAChE,IAAI,MAAM,EAAE,CAAC,CAAC,CAAC;QAAE,OAAO,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAEzC,8BAA8B;IAC9B,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IAC7C,IAAI,UAAU;QAAE,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC;IAErC,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;AACrB,CAAC;AAED,oEAAoE;AACpE,SAAS,mBAAmB,CAAC,IAAY;IAKvC,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IACjC,MAAM,eAAe,GAAG;QACtB,KAAK;QACL,MAAM;QACN,OAAO;QACP,WAAW;QACX,SAAS;QACT,aAAa;QACb,MAAM;QACN,MAAM;KACP,CAAC;IACF,MAAM,eAAe,GAAG;QACtB,IAAI;QACJ,MAAM;QACN,UAAU;QACV,SAAS;QACT,WAAW;QACX,SAAS;QACT,OAAO;QACP,MAAM;KACP,CAAC;IAEF,MAAM,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IACzE,MAAM,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IAEzE,MAAM,MAAM,GAAG,QAAQ,GAAG,QAAQ,CAAC;IACnC,OAAO;QACL,MAAM;QACN,KAAK,EAAE,MAAM,CAAC,CAAC,CAA
C,GAAG,CAAC,CAAC,CAAC,GAAG;QACzB,MAAM,EAAE,4BAA4B,QAAQ,SAAS,QAAQ,MAAM,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;KACxF,CAAC;AACJ,CAAC;AAED,wEAAwE;AACxE,SAAS,mBAAmB,CAAC,IAAY;IACvC,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IACjC,MAAM,eAAe,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;IAC1E,MAAM,eAAe,GAAG,CAAC,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;IAE3E,MAAM,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IACzE,MAAM,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IACzE,MAAM,MAAM,GAAG,QAAQ,GAAG,QAAQ,CAAC;IAEnC,OAAO;QACL,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG;QACzB,QAAQ,EAAE;YACR;gBACE,IAAI,EAAE,SAAS;gBACf,MAAM;gBACN,WAAW,EAAE,wBAAwB,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;aAC1D;SACF;KACF,CAAC;AACJ,CAAC"}
@@ -0,0 +1,8 @@
1
+ import type { EvalReport } from "./types.js";
2
/** Print a human-readable view of `report` to the console. */
export declare function printReport(
    report: EvalReport,
): void;

/**
 * Write `report` as JSON into the `.results` folder under `evalsDir`.
 * Resolves to the path of the file that was written.
 */
export declare function saveResult(
    evalsDir: string,
    report: EvalReport,
): Promise<string>;

/** Write `report` as pretty-printed JSON to `outputPath`. */
export declare function writeJsonReport(
    report: EvalReport,
    outputPath: string,
): Promise<void>;

/**
 * Load every `.json` report stored in the `.results` folder under `evalsDir`.
 * Resolves to an empty array when the folder cannot be read.
 */
export declare function loadSavedResults(
    evalsDir: string,
): Promise<EvalReport[]>;

/**
 * Build a markdown report comparing the latest saved run of each model,
 * optionally including `currentReport`. Resolves to the markdown text.
 */
export declare function generateComparisonReport(
    evalsDir: string,
    currentReport?: EvalReport,
): Promise<string>;

/**
 * Generate the comparison report and write it to `evals-report.md` inside
 * `evalsDir`. Resolves to the path of the file that was written.
 */
export declare function writeMarkdownReport(
    evalsDir: string,
    currentReport?: EvalReport,
): Promise<string>;
8
+ //# sourceMappingURL=reporter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reporter.d.ts","sourceRoot":"","sources":["../../src/eval/reporter.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAA2B,MAAM,YAAY,CAAC;AAItE,wBAAgB,WAAW,CAAC,MAAM,EAAE,UAAU,GAAG,IAAI,CAuBpD;AA+DD,wBAAsB,UAAU,CAC9B,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,UAAU,GACjB,OAAO,CAAC,MAAM,CAAC,CAejB;AAID,wBAAsB,eAAe,CACnC,MAAM,EAAE,UAAU,EAClB,UAAU,EAAE,MAAM,GACjB,OAAO,CAAC,IAAI,CAAC,CAGf;AAID,wBAAsB,gBAAgB,CACpC,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,UAAU,EAAE,CAAC,CAkBvB;AAiBD,wBAAsB,wBAAwB,CAC5C,QAAQ,EAAE,MAAM,EAChB,aAAa,CAAC,EAAE,UAAU,GACzB,OAAO,CAAC,MAAM,CAAC,CA0IjB;AAED,wBAAsB,mBAAmB,CACvC,QAAQ,EAAE,MAAM,EAChB,aAAa,CAAC,EAAE,UAAU,GACzB,OAAO,CAAC,MAAM,CAAC,CAMjB"}
@@ -0,0 +1,242 @@
1
+ import { readFile, readdir, writeFile, mkdir } from "node:fs/promises";
2
+ import { join } from "node:path";
3
+ import chalk from "chalk";
4
+ // ─── Console reporter ──────────────────────────────────────────────────
5
/**
 * Print an eval report to the console: a header (agent, model, eval count),
 * the details of every eval, then a summary line. When the summary reports
 * failures, the names of evals whose pass rate is below ~100% are listed.
 *
 * @param report - reads `agent`, `provider`, `model`, `evals` and `summary`.
 */
export function printReport(report) {
    const { agent, provider, model, evals } = report;
    console.log(chalk.bold(`\nAgent: ${agent}`));
    console.log(chalk.dim(`Model: ${provider}/${model}`));
    console.log(chalk.dim(`Evals: ${evals.length} total\n`));
    evals.forEach((entry) => {
        printEval(entry);
    });
    const summary = report.summary;
    const paint = summary.failed === 0 ? chalk.green : chalk.red;
    console.log(paint(`\nSummary: ${summary.passed}/${summary.total} evals passed`));
    if (summary.failed > 0) {
        // Small tolerance so a pass rate that is 1 up to rounding is not listed.
        const failedNames = [];
        for (const entry of evals) {
            if (entry.passRate < 1 - 0.001) {
                failedNames.push(entry.name);
            }
        }
        console.log(chalk.red(` Failed: ${failedNames.join(", ")}`));
    }
    console.log();
}
23
/**
 * Print one eval: its name, every trial, and a coloured PASS/FAIL line with
 * the passed-trial count, average score, p50 latency and (when known) tokens.
 * An eval counts as PASS when its pass rate is at least 0.8.
 *
 * @param result - reads `name`, `trials`, `passRate`, `avgScore`,
 *   `p50LatencyMs` and the optional `totalTokens.totalTokens`.
 */
function printEval(result) {
    const trialCount = result.trials.length;
    const passedCount = result.trials.filter((t) => t.passed).length;
    console.log(chalk.bold(`${result.name} (${trialCount} trials)`));
    for (const trial of result.trials) {
        printTrial(trial);
    }
    const evalPassed = result.passRate >= 0.8;
    const statusColor = evalPassed ? chalk.green : chalk.red;
    const status = evalPassed ? "PASS" : "FAIL";
    // `totalTokens` is treated as optional elsewhere in this module
    // (generateComparisonReport uses `?.`), so guard it here as well instead
    // of throwing on results that lack token usage.
    const tokenCount = result.totalTokens?.totalTokens;
    const tokenInfo = tokenCount ? ` tokens=${tokenCount}` : "";
    console.log(statusColor(` ${status} ${passedCount}/${trialCount} avg=${result.avgScore.toFixed(2)} p50=${result.p50LatencyMs}ms${tokenInfo}`));
    console.log();
}
38
/**
 * Print one trial: a status line with score and duration, each failed
 * assertion, every rubric criterion (with the explanation for failed ones),
 * and, for failed trials, the trace IDs recorded on its turns.
 *
 * @param trial - reads `passed`, `trial`, `score`, `durationMs`, `turns`
 *   and the optional `rubric.criteria`.
 */
function printTrial(trial) {
    const mark = (ok) => (ok ? chalk.green("✓") : chalk.red("✗"));
    const icon = mark(trial.passed);
    const seconds = (trial.durationMs / 1000).toFixed(1);
    const latency = chalk.dim(`(${seconds}s)`);
    console.log(` ${icon} Trial ${trial.trial}: ${trial.score.toFixed(2)} ${latency}`);
    for (const turn of trial.turns) {
        for (const assertion of turn.assertions) {
            if (assertion.passed) {
                continue;
            }
            console.log(chalk.red(` └ ${assertion.type}: ${assertion.reason ?? "FAIL"}`));
        }
    }
    if (trial.rubric) {
        for (const criterion of trial.rubric.criteria) {
            console.log(` ${mark(criterion.passed)} ${criterion.name}`);
            if (!criterion.passed) {
                console.log(chalk.red(` └ ${criterion.explanation}`));
            }
        }
    }
    // Show trace IDs for failed trials (for debugging in Grafana/Tempo)
    if (trial.passed) {
        return;
    }
    const traceIds = trial.turns.map((t) => t.traceId).filter(Boolean);
    if (traceIds.length > 0) {
        console.log(chalk.dim(` traces: ${traceIds.join(", ")}`));
    }
}
66
+ // ─── Auto-save results ─────────────────────────────────────────────────
67
/**
 * Persist a report as pretty-printed JSON under `<evalsDir>/.results`,
 * creating the folder if needed.
 *
 * The file is named `<provider>-<model>_<timestamp>.json`: characters other
 * than letters, digits and "-" in the model slug become "-", and ":" / "." in
 * the timestamp become "-" before it is cut to its first 19 characters.
 *
 * @param evalsDir - directory that holds the `.results` folder.
 * @param report - report to serialise; reads `provider`, `model`, `timestamp`.
 * @returns the path of the written file.
 */
export async function saveResult(evalsDir, report) {
    const targetDir = join(evalsDir, ".results");
    await mkdir(targetDir, { recursive: true });
    const modelSlug = `${report.provider}-${report.model}`.replace(/[^a-z0-9-]/gi, "-");
    const stamp = report.timestamp.replace(/[:.]/g, "-").slice(0, 19);
    const filepath = join(targetDir, `${modelSlug}_${stamp}.json`);
    const payload = JSON.stringify(report, null, 2);
    await writeFile(filepath, payload);
    console.log(chalk.dim(`Results saved to ${filepath}`));
    return filepath;
}
78
+ // ─── JSON file output ──────────────────────────────────────────────────
79
/**
 * Write `report` to `outputPath` as JSON indented with two spaces and log
 * where it was written.
 */
export async function writeJsonReport(report, outputPath) {
    const payload = JSON.stringify(report, null, 2);
    await writeFile(outputPath, payload);
    console.log(chalk.dim(`Results written to ${outputPath}`));
}
83
+ // ─── Markdown comparison report ────────────────────────────────────────
84
/**
 * Load every saved report from `<evalsDir>/.results`.
 *
 * Files are visited in descending filename order. Filenames start with the
 * model slug, so this is NOT a global newest-first order; callers that need
 * the latest run should compare `timestamp`.
 *
 * A file that cannot be read, is not valid JSON, or does not look like a
 * report (no `evals` array) is skipped with a warning rather than discarding
 * every other result.
 *
 * @param evalsDir - directory that holds the `.results` folder.
 * @returns the parsed reports; an empty array when the folder cannot be read.
 */
export async function loadSavedResults(evalsDir) {
    const resultsDir = join(evalsDir, ".results");
    let files;
    try {
        files = await readdir(resultsDir);
    }
    catch {
        // No results folder yet (or it is unreadable): nothing saved.
        return [];
    }
    const jsonFiles = files
        .filter((f) => f.endsWith(".json"))
        .sort()
        .reverse();
    const reports = [];
    for (const file of jsonFiles) {
        try {
            const content = await readFile(join(resultsDir, file), "utf-8");
            const parsed = JSON.parse(content);
            if (parsed === null || typeof parsed !== "object" || !Array.isArray(parsed.evals)) {
                throw new Error("not an eval report");
            }
            reports.push(parsed);
        }
        catch (err) {
            // One bad file must not hide all the valid ones.
            const detail = err instanceof Error ? err.message : String(err);
            console.warn(`Skipping unreadable eval result ${file}: ${detail}`);
        }
    }
    return reports;
}
103
/**
 * Keep only the most recent report for each `provider/model` pair.
 *
 * Reports are compared by their `timestamp` strings; on a tie the report seen
 * first is kept. The result lists models in order of first appearance.
 */
function latestPerModel(reports) {
    const newest = new Map();
    reports.forEach((candidate) => {
        const modelKey = `${candidate.provider}/${candidate.model}`;
        const current = newest.get(modelKey);
        if (current && !(candidate.timestamp > current.timestamp)) {
            return;
        }
        newest.set(modelKey, candidate);
    });
    return [...newest.values()];
}
117
/**
 * Build a markdown report comparing the latest run of each model.
 *
 * Saved results are loaded from `evalsDir`; `currentReport`, when given, is
 * considered alongside them. The report contains a per-eval comparison table,
 * an overall-scores table, rubric details, and the failed trials with their
 * trace IDs, failed assertions and failed rubric criteria.
 *
 * @param evalsDir - directory whose saved results are loaded.
 * @param currentReport - optional report to include before the saved ones.
 * @returns the markdown text, or "No eval results found." when there are none.
 */
export async function generateComparisonReport(evalsDir, currentReport) {
    const allReports = await loadSavedResults(evalsDir);
    if (currentReport) {
        allReports.unshift(currentReport);
    }
    const models = latestPerModel(allReports);
    if (models.length === 0) {
        return "No eval results found.";
    }
    const labelOf = (r) => `${r.provider}/${r.model}`;
    // An eval counts as passing when at least 80% of its trials passed.
    const meetsThreshold = (e) => e.passRate >= 0.8;
    // Mean of `pick(eval)` over a report's evals; 0 when it has none.
    const meanOver = (evals, pick) => {
        if (!(evals.length > 0)) {
            return 0;
        }
        return evals.reduce((sum, e) => sum + pick(e), 0) / evals.length;
    };
    // Collect all eval names across all models
    const nameSet = new Set();
    for (const m of models) {
        for (const e of m.evals) {
            nameSet.add(e.name);
        }
    }
    const evalNames = Array.from(nameSet).sort();
    // Fragments are concatenated at the end, in order.
    const out = [];
    out.push("# Eval Report\n\n");
    out.push(`Generated: ${new Date().toISOString()}\n`);
    out.push(`Agent: ${models[0]?.agent ?? "unknown"}\n\n`);
    // ─── Summary table ──────────────────────────────────────────────
    out.push("## Model Comparison\n\n");
    out.push(`| Eval | ${models.map(labelOf).join(" | ")} |\n`);
    out.push(`| --- | ${models.map(() => "---").join(" | ")} |\n`);
    for (const evalName of evalNames) {
        const cells = [];
        for (const m of models) {
            const match = m.evals.find((e) => e.name === evalName);
            if (!match) {
                cells.push("-");
                continue;
            }
            const verdict = meetsThreshold(match) ? "PASS" : "FAIL";
            cells.push(`${verdict} ${match.avgScore.toFixed(2)} (${Math.round(match.passRate * 100)}%)`);
        }
        out.push(`| ${evalName} | ${cells.join(" | ")} |\n`);
    }
    // ─── Overall scores ──────────────────────────────────────────────
    out.push("\n## Overall Scores\n\n");
    out.push("| Model | Pass Rate | Avg Score | p50 Latency | Total Tokens |\n");
    out.push("| --- | --- | --- | --- | --- |\n");
    for (const report of models) {
        const evals = report.evals;
        const overallPassRate = evals.length > 0
            ? evals.filter(meetsThreshold).length / evals.length
            : 0;
        const overallAvgScore = meanOver(evals, (e) => e.avgScore);
        const overallP50 = meanOver(evals, (e) => e.p50LatencyMs);
        let totalTokens = 0;
        for (const e of evals) {
            totalTokens += e.totalTokens?.totalTokens ?? 0;
        }
        out.push(`| ${labelOf(report)} | ${Math.round(overallPassRate * 100)}% | ${overallAvgScore.toFixed(2)} | ${Math.round(overallP50)}ms | ${totalTokens.toLocaleString()} |\n`);
    }
    // ─── Rubric details (latest run per model) ───────────────────────
    for (const report of models) {
        const rubricEvals = report.evals.filter((e) => e.trials.some((t) => t.rubric));
        if (rubricEvals.length === 0) {
            continue;
        }
        out.push(`\n## Rubric Details: ${labelOf(report)}\n\n`);
        for (const evalResult of rubricEvals) {
            out.push(`### ${evalResult.name}\n\n`);
            // Show criteria from first trial that has rubric
            const rubricTrial = evalResult.trials.find((t) => t.rubric);
            if (!rubricTrial?.rubric) {
                continue;
            }
            for (const criterion of rubricTrial.rubric.criteria) {
                const verdict = criterion.passed ? "PASS" : "FAIL";
                const detail = criterion.passed ? "" : ` -- ${criterion.explanation}`;
                out.push(`- **${criterion.name}**: ${verdict}${detail}\n`);
            }
            out.push("\n");
        }
    }
    // ─── Failed trials with transcripts and trace IDs ─────────────────
    for (const report of models) {
        const failedEvals = report.evals.filter((e) => e.trials.some((t) => !t.passed));
        if (failedEvals.length === 0) {
            continue;
        }
        out.push(`\n## Failed Trials: ${labelOf(report)}\n\n`);
        for (const evalResult of failedEvals) {
            for (const trial of evalResult.trials) {
                if (trial.passed) {
                    continue;
                }
                out.push(`### ${evalResult.name} -- Trial ${trial.trial} (score: ${trial.score.toFixed(2)})\n\n`);
                // Trace IDs for Grafana/Tempo lookup
                const traceIds = trial.turns.map((t) => t.traceId).filter(Boolean);
                if (traceIds.length > 0) {
                    const quoted = traceIds.map((id) => `\`${id}\``);
                    out.push(`**Trace IDs:** ${quoted.join(", ")}\n\n`);
                }
                // Failed assertions
                for (const turn of trial.turns) {
                    const failedAssertions = turn.assertions.filter((a) => !a.passed);
                    if (failedAssertions.length === 0) {
                        continue;
                    }
                    out.push(`**User:** ${turn.user}\n\n`);
                    // Agent output is capped at 500 characters in the report.
                    const ellipsis = turn.agent.length > 500 ? "..." : "";
                    out.push(`**Agent:** ${turn.agent.slice(0, 500)}${ellipsis}\n\n`);
                    for (const assertion of failedAssertions) {
                        const why = assertion.reason ? ` -- ${assertion.reason}` : "";
                        out.push(`- **${assertion.type}**: FAIL${why}\n`);
                    }
                    out.push("\n");
                }
                // Rubric failures
                if (!trial.rubric) {
                    continue;
                }
                const failedCriteria = trial.rubric.criteria.filter((c) => !c.passed);
                if (failedCriteria.length > 0) {
                    out.push("**Rubric failures:**\n");
                    for (const c of failedCriteria) {
                        out.push(`- **${c.name}**: ${c.explanation}\n`);
                    }
                    out.push("\n");
                }
            }
        }
    }
    return out.join("");
}
235
/**
 * Generate the comparison report and write it to `evals-report.md` inside
 * `evalsDir`, logging the destination.
 *
 * @param evalsDir - directory to read saved results from and write the report to.
 * @param currentReport - optional report forwarded to generateComparisonReport.
 * @returns the path of the written markdown file.
 */
export async function writeMarkdownReport(evalsDir, currentReport) {
    const markdown = await generateComparisonReport(evalsDir, currentReport);
    const destination = join(evalsDir, "evals-report.md");
    await writeFile(destination, markdown);
    console.log(chalk.dim(`Report written to ${destination}`));
    return destination;
}
242
+ //# sourceMappingURL=reporter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reporter.js","sourceRoot":"","sources":["../../src/eval/reporter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACvE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,KAAK,MAAM,OAAO,CAAC;AAG1B,0EAA0E;AAE1E,MAAM,UAAU,WAAW,CAAC,MAAkB;IAC5C,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,YAAY,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IACpE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,MAAM,CAAC,KAAK,CAAC,MAAM,UAAU,CAAC,CAAC,CAAC;IAEhE,KAAK,MAAM,UAAU,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QACtC,SAAS,CAAC,UAAU,CAAC,CAAC;IACxB,CAAC;IAED,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC;IACjD,MAAM,YAAY,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;IAC5D,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,cAAc,MAAM,IAAI,KAAK,eAAe,CAAC,CAAC,CAAC;IACxE,IAAI,MAAM,GAAG,CAAC,EAAE,CAAC;QACf,OAAO,CAAC,GAAG,CACT,KAAK,CAAC,GAAG,CACP,aAAa,MAAM,CAAC,KAAK;aACtB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,GAAG,KAAK,CAAC;aACrC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;aAClB,IAAI,CAAC,IAAI,CAAC,EAAE,CAChB,CACF,CAAC;IACJ,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;AAChB,CAAC;AAED,SAAS,SAAS,CAAC,MAAkB;IACnC,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC;IACxC,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IAEjE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,IAAI,KAAK,UAAU,UAAU,CAAC,CAAC,CAAC;IAEjE,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;QAClC,UAAU,CAAC,KAAK,CAAC,CAAC;IACpB,CAAC;IAED,MAAM,WAAW,GAAG,MAAM,CAAC,QAAQ,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;IACrE,MAAM,MAAM,GAAG,MAAM,CAAC,QAAQ,IAAI,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;IACxD,MAAM,SAAS,GAAG,MAAM,CAAC,WAAW,CAAC,WAAW;QAC9C,CAAC,CAAC,WAAW,MAAM,CAAC,WAAW,CAAC,WAAW,EAAE;QAC7C,CAAC,CAAC,EAAE,CAAC;IACP,OAAO,CAAC,GAAG,CACT,
WAAW,CACT,KAAK,MAAM,IAAI,WAAW,IAAI,UAAU,QAAQ,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,MAAM,CAAC,YAAY,KAAK,SAAS,EAAE,CACtH,CACF,CAAC;IACF,OAAO,CAAC,GAAG,EAAE,CAAC;AAChB,CAAC;AAED,SAAS,UAAU,CAAC,KAAkB;IACpC,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC9D,MAAM,OAAO,GAAG,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CACT,KAAK,IAAI,UAAU,KAAK,CAAC,KAAK,KAAK,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,OAAO,EAAE,CACvE,CAAC;IAEF,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;QAC/B,KAAK,MAAM,SAAS,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACxC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC;gBACtB,OAAO,CAAC,GAAG,CACT,KAAK,CAAC,GAAG,CAAC,SAAS,SAAS,CAAC,IAAI,KAAK,SAAS,CAAC,MAAM,IAAI,MAAM,EAAE,CAAC,CACpE,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;QACjB,KAAK,MAAM,SAAS,IAAI,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC;YAC9C,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACnE,OAAO,CAAC,GAAG,CAAC,OAAO,KAAK,IAAI,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC;YAC9C,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC;gBACtB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,WAAW,SAAS,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;YAC7D,CAAC;QACH,CAAC;IACH,CAAC;IAED,oEAAoE;IACpE,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC;QAClB,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACnE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,eAAe,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;AACH,CAAC;AAED,0EAA0E;AAE1E,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,QAAgB,EAChB,MAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC9C,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE7C,MAAM,IAAI,GAAG,GAAG,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC,OAAO,CACvD,cAAc,EACd,GAAG,CACJ,CAAC;IACF,MAAM,EAAE,GAA
G,MAAM,CAAC,SAAS,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC/D,MAAM,QAAQ,GAAG,GAAG,IAAI,IAAI,EAAE,OAAO,CAAC;IACtC,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;IAE5C,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAC3D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,oBAAoB,QAAQ,EAAE,CAAC,CAAC,CAAC;IACvD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,0EAA0E;AAE1E,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,MAAkB,EAClB,UAAkB;IAElB,MAAM,SAAS,CAAC,UAAU,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAC7D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,sBAAsB,UAAU,EAAE,CAAC,CAAC,CAAC;AAC7D,CAAC;AAED,0EAA0E;AAE1E,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,QAAgB;IAEhB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC9C,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;QACxC,MAAM,SAAS,GAAG,KAAK;aACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;aAClC,IAAI,EAAE;aACN,OAAO,EAAE,CAAC,CAAC,eAAe;QAE7B,MAAM,OAAO,GAAiB,EAAE,CAAC;QACjC,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;YAC7B,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAe,CAAC,CAAC;QAClD,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,OAAqB;IAC3C,MAAM,OAAO,GAAG,IAAI,GAAG,EAAsB,CAAC;IAC9C,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,GAAG,GAAG,GAAG,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QACjD,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAClC,IAAI,CAAC,QAAQ,IAAI,MAAM,CAAC,SAAS,GAAG,QAAQ,CAAC,SAAS,EAAE,CAAC;YACvD,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;QAC3B,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;AACtC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,wBAAwB,CAC5C,QAAgB,EAChB,aAA0B;IAE1B,MAAM,UAAU,GAAG,MAAM,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IACpD,IAAI,aAAa;QAAE,UAAU,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IAErD,MAAM,MAAM,GAAG,cAAc,CAAC,UAAU,CAAC,CAAC;IAE1C,IAAI,MAAM,CAAC,MAAM,KAAK,C
AAC,EAAE,CAAC;QACxB,OAAO,wBAAwB,CAAC;IAClC,CAAC;IAED,2CAA2C;IAC3C,MAAM,SAAS,GAAG;QAChB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;KAC9D,CAAC,IAAI,EAAE,CAAC;IAET,IAAI,EAAE,GAAG,mBAAmB,CAAC;IAC7B,EAAE,IAAI,cAAc,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,IAAI,CAAC;IACjD,EAAE,IAAI,UAAU,MAAM,CAAC,CAAC,CAAC,EAAE,KAAK,IAAI,SAAS,MAAM,CAAC;IAEpD,mEAAmE;IACnE,EAAE,IAAI,yBAAyB,CAAC;IAChC,EAAE,IAAI,YAAY,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;IAClF,EAAE,IAAI,WAAW,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;IAE3D,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;YAC7B,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;YAC5D,IAAI,CAAC,UAAU;gBAAE,OAAO,GAAG,CAAC;YAC5B,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,IAAI,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;YAC1D,OAAO,GAAG,IAAI,IAAI,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,QAAQ,GAAG,GAAG,CAAC,IAAI,CAAC;QACjG,CAAC,CAAC,CAAC;QACH,EAAE,IAAI,KAAK,QAAQ,MAAM,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;IACnD,CAAC;IAED,oEAAoE;IACpE,EAAE,IAAI,yBAAyB,CAAC;IAChC,EAAE,IAAI,kEAAkE,CAAC;IACzE,EAAE,IAAI,mCAAmC,CAAC;IAE1C,KAAK,MAAM,MAAM,IAAI,MAAM,EAAE,CAAC;QAC5B,MAAM,eAAe,GACnB,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC;YACrB,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,IAAI,GAAG,CAAC,CAAC,MAAM;gBACpD,MAAM,CAAC,KAAK,CAAC,MAAM;YACrB,CAAC,CAAC,CAAC,CAAC;QACR,MAAM,eAAe,GACnB,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC;YACrB,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;gBACpD,MAAM,CAAC,KAAK,CAAC,MAAM;YACrB,CAAC,CAAC,CAAC,CAAC;QACR,MAAM,UAAU,GACd,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC;YACrB,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CA
AC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,YAAY,EAAE,CAAC,CAAC;gBACxD,MAAM,CAAC,KAAK,CAAC,MAAM;YACrB,CAAC,CAAC,CAAC,CAAC;QACR,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,CACrC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,WAAW,EAAE,WAAW,IAAI,CAAC,CAAC,EACnD,CAAC,CACF,CAAC;QAEF,EAAE,IAAI,KAAK,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,KAAK,MAAM,IAAI,CAAC,KAAK,CAAC,eAAe,GAAG,GAAG,CAAC,OAAO,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,QAAQ,WAAW,CAAC,cAAc,EAAE,MAAM,CAAC;IAC3L,CAAC;IAED,oEAAoE;IACpE,KAAK,MAAM,MAAM,IAAI,MAAM,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAC5C,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAC/B,CAAC;QACF,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAEvC,EAAE,IAAI,wBAAwB,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,KAAK,MAAM,CAAC;QACpE,KAAK,MAAM,UAAU,IAAI,WAAW,EAAE,CAAC;YACrC,EAAE,IAAI,OAAO,UAAU,CAAC,IAAI,MAAM,CAAC;YACnC,iDAAiD;YACjD,MAAM,KAAK,GAAG,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;YACtD,IAAI,CAAC,KAAK,EAAE,MAAM;gBAAE,SAAS;YAE7B,KAAK,MAAM,SAAS,IAAI,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC;gBAC9C,MAAM,IAAI,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;gBAChD,EAAE,IAAI,OAAO,SAAS,CAAC,IAAI,OAAO,IAAI,EAAE,CAAC;gBACzC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC;oBACtB,EAAE,IAAI,OAAO,SAAS,CAAC,WAAW,EAAE,CAAC;gBACvC,CAAC;gBACD,EAAE,IAAI,IAAI,CAAC;YACb,CAAC;YACD,EAAE,IAAI,IAAI,CAAC;QACb,CAAC;IACH,CAAC;IAED,qEAAqE;IACrE,KAAK,MAAM,MAAM,IAAI,MAAM,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAC5C,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAChC,CAAC;QACF,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAEvC,EAAE,IAAI,uBAAuB,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,KAAK,MAAM,CAAC;QACnE,KAAK,MAAM,UAAU,IAAI,WAAW,EAAE,CAAC;YACrC,MAAM,YAAY,GAAG,UAAU,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;YAChE,KAAK,MAAM,KAAK,IAAI,YAAY,EAAE,CAAC;gBACjC,EAAE,IAA
I,OAAO,UAAU,CAAC,IAAI,aAAa,KAAK,CAAC,KAAK,YAAY,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;gBAE9F,qCAAqC;gBACrC,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;gBACnE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACxB,EAAE,IAAI,kBAAkB,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;gBAC7E,CAAC;gBAED,oBAAoB;gBACpB,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;oBAC/B,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;oBACxD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;wBAAE,SAAS;oBAClC,EAAE,IAAI,aAAa,IAAI,CAAC,IAAI,MAAM,CAAC;oBACnC,EAAE,IAAI,cAAc,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC;oBAC1F,KAAK,MAAM,SAAS,IAAI,MAAM,EAAE,CAAC;wBAC/B,EAAE,IAAI,OAAO,SAAS,CAAC,IAAI,UAAU,CAAC;wBACtC,IAAI,SAAS,CAAC,MAAM;4BAAE,EAAE,IAAI,OAAO,SAAS,CAAC,MAAM,EAAE,CAAC;wBACtD,EAAE,IAAI,IAAI,CAAC;oBACb,CAAC;oBACD,EAAE,IAAI,IAAI,CAAC;gBACb,CAAC;gBAED,kBAAkB;gBAClB,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;oBACjB,MAAM,cAAc,GAAG,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;oBACtE,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBAC9B,EAAE,IAAI,wBAAwB,CAAC;wBAC/B,KAAK,MAAM,CAAC,IAAI,cAAc,EAAE,CAAC;4BAC/B,EAAE,IAAI,OAAO,CAAC,CAAC,IAAI,OAAO,CAAC,CAAC,WAAW,IAAI,CAAC;wBAC9C,CAAC;wBACD,EAAE,IAAI,IAAI,CAAC;oBACb,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,mBAAmB,CACvC,QAAgB,EAChB,aAA0B;IAE1B,MAAM,EAAE,GAAG,MAAM,wBAAwB,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;IACnE,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,iBAAiB,CAAC,CAAC;IACrD,MAAM,SAAS,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;IAChC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,qBAAqB,UAAU,EAAE,CAAC,CAAC,CAAC;IAC1D,OAAO,UAAU,CAAC;AACpB,CAAC"}
@@ -0,0 +1,11 @@
1
+ import type { EvalDefinition, EvalResult } from "./types.js";
2
// Options accepted by runEval (declared below) to control how an eval is executed.
+ export interface RunOptions {
3
// Gateway URL; the runner forwards it to session creation and to the graders.
+ gatewayUrl: string;
4
// Auth token forwarded alongside gatewayUrl on those same calls.
+ authToken: string;
5
// Optional agent id; the runner passes it through when creating each trial's session.
+ agentId?: string;
6
// NOTE(review): not read by the runner code visible here — presumably consumed elsewhere; confirm.
+ provider?: string;
7
// NOTE(review): not read by the runner code visible here — presumably consumed elsewhere; confirm.
+ model?: string;
8
// When set (not null/undefined), used instead of the definition's own `trials` count.
+ trialsOverride?: number;
9
+ }
10
// Runs the eval definition's trials and resolves with the aggregated EvalResult; evalFilePath is the base for resolving the definition's relative rubric path.
+ export declare function runEval(definition: EvalDefinition, evalFilePath: string, options: RunOptions): Promise<EvalResult>;
11
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/eval/runner.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAGV,cAAc,EACd,UAAU,EAKX,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,UAAU;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,wBAAsB,OAAO,CAC3B,UAAU,EAAE,cAAc,EAC1B,YAAY,EAAE,MAAM,EACpB,OAAO,EAAE,UAAU,GAClB,OAAO,CAAC,UAAU,CAAC,CAqCrB"}
@@ -0,0 +1,172 @@
1
+ import { readFile } from "node:fs/promises";
2
+ import { dirname, join } from "node:path";
3
+ import { createSession, deleteSession, sendAndCollect } from "./client.js";
4
+ import { gradeInline, gradeWithRubric } from "./grader.js";
5
/**
 * Run every trial of an eval definition, one after another, and summarise them.
 *
 * @param {object} definition - Eval definition; `name`, `trials`, `rubric`
 *   are read here and the whole object is handed to each trial.
 * @param {string} evalFilePath - Path of the eval file; the rubric path in the
 *   definition is resolved relative to this file's directory.
 * @param {object} options - Run options; `trialsOverride`, when not
 *   null/undefined, replaces `definition.trials`.
 * @returns {Promise<object>} Summary with `name`, `passRate`, `avgScore`,
 *   `p50LatencyMs`, `totalTokens` and the per-trial results in `trials`.
 */
export async function runEval(definition, evalFilePath, options) {
    const trialCount = options.trialsOverride ?? definition.trials;
    const trialResults = [];

    // The rubric file is optional; when named it lives next to the eval file.
    const rubricContent = definition.rubric
        ? await readFile(join(dirname(evalFilePath), definition.rubric), "utf-8")
        : undefined;

    // Trials run sequentially and are numbered from 1.
    for (let index = 0; index < trialCount; index += 1) {
        trialResults.push(await runTrial(index + 1, definition, rubricContent, options));
    }

    let passedCount = 0;
    let scoreSum = 0;
    const latencies = [];
    for (const trial of trialResults) {
        if (trial.passed) {
            passedCount += 1;
        }
        scoreSum += trial.score;
        for (const turn of trial.turns) {
            latencies.push(turn.latencyMs);
        }
    }
    latencies.sort((left, right) => left - right);

    // Token usage summed over every turn of every trial.
    const totalTokens = aggregateTokens(trialResults);

    const hasTrials = trialCount > 0;
    // Element at floor(length / 2) of the sorted latencies; 0 when there are none.
    const middleLatency = latencies[Math.floor(latencies.length / 2)] ?? 0;

    return {
        name: definition.name,
        passRate: hasTrials ? passedCount / trialCount : 0,
        avgScore: hasTrials ? scoreSum / trialCount : 0,
        p50LatencyMs: middleLatency,
        totalTokens,
        trials: trialResults,
    };
}
34
/**
 * Execute a single trial: open a fresh session, play every turn of the
 * definition through it, grade the outcome, and always delete the session.
 *
 * @param {number} trialNum - Trial number reported back in the result.
 * @param {object} definition - Eval definition (`timeout` in seconds, `turns`,
 *   `scoring.pass_threshold` are read here).
 * @param {string|undefined} rubricContent - Rubric text; when truthy the whole
 *   transcript is graded against it.
 * @param {object} options - Run options (`gatewayUrl`, `authToken`, `agentId`).
 * @returns {Promise<object>} `{ trial, passed, score, turns, rubric, durationMs }`.
 */
async function runTrial(trialNum, definition, rubricContent, options) {
    const startedAt = Date.now();
    const timeoutMs = definition.timeout * 1000;
    const { gatewayUrl, authToken, agentId } = options;
    const session = await createSession(gatewayUrl, authToken, {
        agentId,
        forceNew: true,
        dryRun: true,
    });
    const transcript = [];
    try {
        for (const turn of definition.turns) {
            const reply = await sendAndCollect(session, turn.content, timeoutMs);
            let agentText;
            let assertionResults;
            if (reply.error) {
                // A failed turn is recorded as one failing "error" assertion;
                // the remaining turns are still played.
                agentText = reply.text || `[Error: ${reply.error}]`;
                assertionResults = [
                    { type: "error", passed: false, score: 0, reason: reply.error },
                ];
            }
            else if (turn.assert) {
                agentText = reply.text;
                assertionResults = await runAssertions(turn.assert, reply.text, gatewayUrl, authToken, timeoutMs);
            }
            else {
                agentText = reply.text;
                assertionResults = [];
            }
            transcript.push({
                user: turn.content,
                agent: agentText,
                latencyMs: reply.latencyMs,
                assertions: assertionResults,
                tokens: reply.tokens,
                traceId: reply.traceId,
            });
        }

        // Optional rubric grading over the complete transcript.
        const rubric = rubricContent
            ? await gradeWithRubric(gatewayUrl, authToken, rubricContent, transcript, timeoutMs)
            : undefined;

        const score = calculateTrialScore(transcript, rubric);
        return {
            trial: trialNum,
            passed: score >= definition.scoring.pass_threshold,
            score,
            turns: transcript,
            rubric,
            durationMs: Date.now() - startedAt,
        };
    }
    finally {
        // Runs on success and on failure so the session is never left behind.
        await deleteSession(session);
    }
}
93
/**
 * Evaluate a turn's assertions against the agent's response.
 *
 * Supported assertion types:
 *  - "contains":   substring match; honours `options.case_insensitive`.
 *  - "regex":      `assertion.value` compiled with the "i" flag and tested.
 *  - "llm-rubric": delegated to `gradeInline`.
 * Any other type yields a failing result with an explanatory `reason`
 * instead of being silently dropped — a dropped assertion would leave the
 * turn with fewer (possibly zero) checks and could let a trial pass vacuously.
 *
 * @param {Array<object>} assertions - Assertions with `type`, `value` and optional `options`.
 * @param {string} agentResponse - Text produced by the agent for this turn.
 * @param {string} gatewayUrl - Forwarded to `gradeInline` for "llm-rubric".
 * @param {string} authToken - Forwarded to `gradeInline` for "llm-rubric".
 * @param {number} timeoutMs - Forwarded to `gradeInline` for "llm-rubric".
 * @returns {Promise<Array<object>>} One `{ type, passed, score, reason? }` per assertion, in order.
 * @throws {SyntaxError} If a "regex" assertion's value is not a valid pattern.
 */
async function runAssertions(assertions, agentResponse, gatewayUrl, authToken, timeoutMs) {
    const results = [];
    for (const assertion of assertions) {
        switch (assertion.type) {
            case "contains": {
                const target = assertion.value;
                const caseInsensitive = assertion.options?.case_insensitive;
                const haystack = caseInsensitive ? agentResponse.toLowerCase() : agentResponse;
                const needle = caseInsensitive ? target.toLowerCase() : target;
                const passed = haystack.includes(needle);
                results.push({ type: "contains", passed, score: passed ? 1 : 0 });
                break;
            }
            case "regex": {
                // A fresh RegExp per assertion: no shared lastIndex state.
                const regex = new RegExp(assertion.value, "i");
                const passed = regex.test(agentResponse);
                results.push({ type: "regex", passed, score: passed ? 1 : 0 });
                break;
            }
            case "llm-rubric": {
                const result = await gradeInline(gatewayUrl, authToken, assertion.value, agentResponse, timeoutMs);
                results.push({
                    type: "llm-rubric",
                    passed: result.passed,
                    score: result.score,
                    reason: result.reason,
                });
                break;
            }
            default: {
                // Unrecognised type: surface it as a failure rather than skipping it.
                results.push({
                    type: assertion.type,
                    passed: false,
                    score: 0,
                    reason: `Unknown assertion type: ${String(assertion.type)}`,
                });
                break;
            }
        }
    }
    return results;
}
129
/**
 * Combine per-turn assertion scores and the optional rubric score into a
 * single trial score.
 *
 * Each turn that has assertions contributes the plain average of its
 * assertion scores; turns without assertions are ignored. The assertion
 * score of the trial is the average of those per-turn averages, so it stays
 * within the range of the individual scores no matter how many turns there
 * are. (Summing the per-turn averages without dividing by the number of
 * turns would let a two-turn trial with one failed turn still reach 1.0.)
 * Averages are computed as sum / count rather than by accumulating
 * `score * (1 / count)`, which drifts (ten passes would give
 * 0.9999999999999999 and miss a threshold of 1).
 *
 * @param {Array<object>} turns - Turn results; each has an `assertions` array of `{ score }`.
 * @param {object|undefined} rubric - Rubric grade with a `score`, if rubric grading ran.
 * @returns {number} Rubric present: 50/50 blend of assertion average and
 *   rubric score, or the rubric score alone when no turn has assertions.
 *   No rubric: the assertion average, or 1 when there is nothing to check.
 */
function calculateTrialScore(turns, rubric) {
    const turnAverages = [];
    for (const turn of turns) {
        const count = turn.assertions.length;
        if (count === 0) {
            continue;
        }
        // Assertions carry no explicit weights, so they count equally within a turn.
        const total = turn.assertions.reduce((sum, assertion) => sum + assertion.score, 0);
        turnAverages.push(total / count);
    }

    const hasAssertions = turnAverages.length > 0;
    const assertionAvg = hasAssertions
        ? turnAverages.reduce((sum, average) => sum + average, 0) / turnAverages.length
        : 0;

    if (rubric) {
        // The rubric is weighted equally to all assertion scores combined.
        return hasAssertions
            ? assertionAvg * 0.5 + rubric.score * 0.5
            : rubric.score;
    }
    // No assertions and no rubric = pass.
    return hasAssertions ? assertionAvg : 1;
}
155
/**
 * Sum token usage over every turn of every trial.
 *
 * Turns without a `tokens` object are skipped, and a missing
 * `inputTokens` / `outputTokens` field counts as 0.
 *
 * @param {Array<object>} trials - Trial results, each with a `turns` array.
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
 */
function aggregateTokens(trials) {
    const totals = { input: 0, output: 0 };
    for (const { turns } of trials) {
        for (const { tokens: usage } of turns) {
            if (!usage) {
                continue;
            }
            totals.input += usage.inputTokens ?? 0;
            totals.output += usage.outputTokens ?? 0;
        }
    }
    return {
        inputTokens: totals.input,
        outputTokens: totals.output,
        totalTokens: totals.input + totals.output,
    };
}
172
+ //# sourceMappingURL=runner.js.map