@sebastiantuyu/agest 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/reporter.js CHANGED
@@ -23,6 +23,52 @@ export function formatReport(report) {
23
23
  if (reason) {
24
24
  lines.push(` reason: "${reason}"`);
25
25
  }
26
+ const result = report.results.find((r) => r.prompt === c);
27
+ if (result?.response.text) {
28
+ const escaped = result.response.text.replace(/"/g, '\\"').replace(/\n/g, '\\n');
29
+ lines.push(` response: "${escaped}"`);
30
+ }
31
+ }
32
+ }
33
+ // Suite breakdown
34
+ const suites = new Set(report.results.map((r) => r.suite).filter(Boolean));
35
+ if (suites.size > 0) {
36
+ lines.push(` suites:`);
37
+ for (const s of suites) {
38
+ const suiteResults = report.results.filter((r) => r.suite === s);
39
+ const suitePassed = suiteResults.filter((r) => r.passed).length;
40
+ const suiteRate = suiteResults.length > 0
41
+ ? Number((suitePassed / suiteResults.length).toFixed(2))
42
+ : 0;
43
+ lines.push(` - name: "${s}"`);
44
+ lines.push(` success_rate: ${suiteRate}`);
45
+ lines.push(` total_cases: ${suiteResults.length}`);
46
+ lines.push(` failed_cases_count: ${suiteResults.length - suitePassed}`);
47
+ if (suitePassed < suiteResults.length) {
48
+ lines.push(` failed_cases:`);
49
+ for (const r of suiteResults.filter((r) => !r.passed)) {
50
+ lines.push(` - "${r.prompt}"`);
51
+ if (r.error) {
52
+ lines.push(` reason: "${r.error}"`);
53
+ }
54
+ if (r.response.text) {
55
+ const escaped = r.response.text.replace(/"/g, '\\"').replace(/\n/g, '\\n');
56
+ lines.push(` response: "${escaped}"`);
57
+ }
58
+ }
59
+ }
60
+ }
61
+ }
62
+ // Statistical runs summary
63
+ const withRuns = report.results.filter((r) => r.runs && r.runs.length > 1);
64
+ if (withRuns.length > 0) {
65
+ lines.push(` statistical_runs:`);
66
+ for (const r of withRuns) {
67
+ const label = r.prompt.length > 50 ? r.prompt.slice(0, 47) + "..." : r.prompt;
68
+ lines.push(` - "${label}"`);
69
+ lines.push(` runs: ${r.runs.length}`);
70
+ lines.push(` pass_rate: ${((r.passRate ?? 0) * 100).toFixed(1)}%`);
71
+ lines.push(` significance: ${((r.statisticalSignificance ?? 0) * 100).toFixed(1)}%`);
26
72
  }
27
73
  }
28
74
  lines.push(` timestamp: "${report.timestamp}"`, ` duration: ${report.duration}`, ` total_cases: ${report.totalCases}`);
package/dist/reports.d.ts CHANGED
@@ -1,3 +1,14 @@
1
/**
 * Summary of one test suite as read back from the `suites:` section of a
 * saved report.
 */
export interface ParsedSuiteResult {
  /** Suite name taken from the `- name: "..."` entry. */
  name: string;
  /** Value of the suite's `success_rate` field. */
  successRate: number;
  /** Value of the suite's `total_cases` field. */
  totalCases: number;
  /** Value of the suite's `failed_cases_count` field. */
  failedCasesCount: number;
  /** Entries listed under the suite's `failed_cases:` key. */
  failedCases: {
    /** Prompt of the failing case. */
    prompt: string;
    /** Failure reason, when the report recorded one. */
    reason?: string;
    /** Agent response text, when the report recorded one. */
    response?: string;
  }[];
}
1
12
  export interface ParsedReport {
2
13
  name?: string;
3
14
  systemPromptHash?: string;
@@ -11,11 +22,13 @@ export interface ParsedReport {
11
22
  failedCases: Array<{
12
23
  prompt: string;
13
24
  reason?: string;
25
+ response?: string;
14
26
  }>;
15
27
  duration: number;
16
28
  timestamp: string;
17
29
  averageInputTokensPerCase?: number;
18
30
  averageOutputTokensPerCase?: number;
31
+ suites?: ParsedSuiteResult[];
19
32
  source: string;
20
33
  }
21
34
  export interface DiffEntry {
@@ -27,8 +40,10 @@ export declare function extractField(content: string, key: string): string | und
27
40
  export declare function parseFailedCases(content: string): Array<{
28
41
  prompt: string;
29
42
  reason?: string;
43
+ response?: string;
30
44
  }>;
31
45
  export declare function parseDimensions(content: string): Record<string, string> | undefined;
46
+ export declare function parseSuites(content: string): ParsedSuiteResult[] | undefined;
32
47
  export declare function parseReport(content: string, source: string): ParsedReport;
33
48
  export declare function findReports(dir: string, depth?: number): Promise<string[]>;
34
49
  export declare function loadDiffEntry(hash: string): Promise<DiffEntry | null>;
@@ -75,4 +90,9 @@ export declare function findVaryingDimensions(reports: ParsedReport[]): string[]
75
90
  * Group reports by the value of a specific dimension.
76
91
  */
77
92
  export declare function groupByDimension(reports: ParsedReport[], dimension: string): Map<string, ParsedReport[]>;
93
+ /**
94
+ * Wilson score interval lower bound at 95% confidence.
95
+ * Gives a conservative success rate estimate that accounts for sample size.
96
+ */
97
+ export declare function wilsonLowerBound(successRate: number, totalCases: number): number;
78
98
  export declare function formatDuration(ms: number): string;
package/dist/reports.js CHANGED
@@ -20,9 +20,21 @@ export function parseFailedCases(content) {
20
20
  break;
21
21
  const promptMatch = line.match(/^\s+- "(.+)"$/);
22
22
  if (promptMatch) {
23
- const next = lines[i + 1];
24
- const reasonMatch = next?.match(/^\s+reason: "(.+)"$/);
25
- cases.push({ prompt: promptMatch[1], reason: reasonMatch?.[1] });
23
+ let reason;
24
+ let response;
25
+ // Look ahead for reason and response fields
26
+ for (let j = i + 1; j < Math.min(i + 3, lines.length); j++) {
27
+ const next = lines[j];
28
+ if (!next || !next.match(/^\s+(reason|response):/))
29
+ break;
30
+ const reasonMatch = next.match(/^\s+reason: "(.+)"$/);
31
+ if (reasonMatch)
32
+ reason = reasonMatch[1];
33
+ const responseMatch = next.match(/^\s+response: "(.+)"$/);
34
+ if (responseMatch)
35
+ response = responseMatch[1].replace(/\\n/g, '\n').replace(/\\"/g, '"');
36
+ }
37
+ cases.push({ prompt: promptMatch[1], reason, response });
26
38
  }
27
39
  }
28
40
  return cases;
@@ -44,6 +56,74 @@ export function parseDimensions(content) {
44
56
  }
45
57
  return Object.keys(dims).length > 0 ? dims : undefined;
46
58
  }
59
/**
 * Read the optional `reason:` and `response:` lines that directly follow a
 * failed-case prompt line.
 *
 * @param lines All lines of the report being parsed.
 * @param promptIdx Index of the `- "<prompt>"` line.
 * @returns The captured reason and the unescaped response text; either is
 *   `undefined` when the corresponding line is absent.
 */
function readSuiteFailedCaseDetails(lines, promptIdx) {
  let reason;
  let response;
  // A prompt is followed by at most two detail lines (reason, response).
  const end = Math.min(promptIdx + 3, lines.length);
  for (let j = promptIdx + 1; j < end; j++) {
    const next = lines[j];
    if (!next || !/^\s+(reason|response):/.test(next))
      break;
    const reasonMatch = next.match(/^\s+reason: "(.+)"$/);
    if (reasonMatch)
      reason = reasonMatch[1];
    const responseMatch = next.match(/^\s+response: "(.+)"$/);
    if (responseMatch)
      // Undo the `\n` and `\"` escapes used when the response was written.
      response = responseMatch[1].replace(/\\n/g, "\n").replace(/\\"/g, '"');
  }
  return { reason, response };
}
/**
 * Parse the `suites:` section of a report into per-suite summaries.
 *
 * The section ends at the first non-blank line that is indented less than the
 * `suites:` header, or equally indented without being a list item. Sibling
 * sections that follow it (for example `statistical_runs:` or the report-level
 * `total_cases`) are therefore never attributed to the last suite.
 *
 * @param content Full text of a report; LF and CRLF line endings are accepted.
 * @returns The parsed suites, or `undefined` when the report has no `suites:`
 *   section or the section lists no suite. Numeric fields missing from a suite
 *   default to 0.
 */
export function parseSuites(content) {
  const lines = content.split(/\r?\n/);
  const startIdx = lines.findIndex((l) => l.trim() === "suites:");
  if (startIdx === -1)
    return undefined;
  const indentOf = (l) => l.length - l.trimStart().length;
  const headerIndent = indentOf(lines[startIdx]);
  const suites = [];
  let current = null;
  let parsingFailedCases = false;
  for (let i = startIdx + 1; i < lines.length; i++) {
    const line = lines[i];
    const trimmed = line.trim();
    if (trimmed === "")
      continue;
    // Leaving the section: a shallower line, or a sibling key at the same depth.
    const indent = indentOf(line);
    if (indent < headerIndent || (indent === headerIndent && !trimmed.startsWith("- ")))
      break;
    const nameMatch = line.match(/^\s+- name: "(.+)"$/);
    if (nameMatch) {
      if (current)
        suites.push(current);
      current = {
        name: nameMatch[1],
        successRate: 0,
        totalCases: 0,
        failedCasesCount: 0,
        failedCases: [],
      };
      parsingFailedCases = false;
      continue;
    }
    // Ignore anything that appears before the first suite entry.
    if (!current)
      continue;
    const srMatch = line.match(/^\s+success_rate: (.+)$/);
    if (srMatch) {
      current.successRate = parseFloat(srMatch[1]);
      continue;
    }
    const tcMatch = line.match(/^\s+total_cases: (.+)$/);
    if (tcMatch) {
      current.totalCases = parseInt(tcMatch[1], 10);
      continue;
    }
    const fccMatch = line.match(/^\s+failed_cases_count: (.+)$/);
    if (fccMatch) {
      current.failedCasesCount = parseInt(fccMatch[1], 10);
      continue;
    }
    if (trimmed === "failed_cases:") {
      parsingFailedCases = true;
      continue;
    }
    if (parsingFailedCases) {
      const promptMatch = line.match(/^\s+- "(.+)"$/);
      if (promptMatch) {
        const { reason, response } = readSuiteFailedCaseDetails(lines, i);
        current.failedCases.push({ prompt: promptMatch[1], reason, response });
      }
    }
  }
  if (current)
    suites.push(current);
  return suites.length > 0 ? suites : undefined;
}
47
127
  export function parseReport(content, source) {
48
128
  const num = (key, fallback = 0) => parseFloat(extractField(content, key) ?? String(fallback));
49
129
  const avgIn = extractField(content, "average_input_tokens_per_case");
@@ -78,6 +158,7 @@ export function parseReport(content, source) {
78
158
  timestamp: extractField(content, "timestamp") ?? "",
79
159
  averageInputTokensPerCase: avgIn != null ? parseFloat(avgIn) : undefined,
80
160
  averageOutputTokensPerCase: avgOut != null ? parseFloat(avgOut) : undefined,
161
+ suites: parseSuites(content),
81
162
  source,
82
163
  };
83
164
  }
@@ -267,6 +348,21 @@ export function groupByDimension(reports, dimension) {
267
348
  }
268
349
  return groups;
269
350
  }
351
/**
 * Wilson score interval lower bound at 95% confidence.
 *
 * Gives a conservative success-rate estimate that accounts for sample size:
 * for the same observed rate, a smaller sample yields a lower bound.
 *
 * @param successRate Observed fraction of passing cases. Values outside
 *   [0, 1] are clamped into that range before the bound is computed.
 * @param totalCases Number of cases the rate was measured over.
 * @returns The lower bound, clamped to [0, 1]. Returns 0 when there is no
 *   usable sample: `totalCases` is zero, negative or NaN, or `successRate`
 *   is NaN.
 */
export function wilsonLowerBound(successRate, totalCases) {
  // `!(x > 0)` also rejects NaN, which a plain `<= 0` comparison would let through.
  if (Number.isNaN(successRate) || !(totalCases > 0))
    return 0;
  const z = 1.96;
  // Clamping keeps p * (1 - p) non-negative so the square root stays real.
  const p = Math.min(1, Math.max(0, successRate));
  const denominator = 1 + (z * z) / totalCases;
  const centre = p + (z * z) / (2 * totalCases);
  const spread = z * Math.sqrt((p * (1 - p) + (z * z) / (4 * totalCases)) / totalCases);
  const lower = (centre - spread) / denominator;
  return Math.max(0, Math.min(1, lower));
}
270
366
  export function formatDuration(ms) {
271
367
  if (ms < 1000)
272
368
  return `${ms.toFixed(0)}ms`;
package/dist/runner.js CHANGED
@@ -13,11 +13,26 @@ export function extractField(response, field) {
13
13
  return response.metadata?.[field];
14
14
  }
15
15
  }
16
- export async function executeScene(executor, scene, globalTimeout, judgeConfig, globalTurns) {
16
/**
 * Compute the Wilson score interval lower bound of a pass rate.
 *
 * The value returned is the lower end of the 95% interval (z = 1.96) around
 * `passes / total`; it is a conservative estimate of the true pass rate and
 * is not compared against any threshold here.
 *
 * @param passes Number of passing runs. The resulting ratio is clamped to
 *   [0, 1], so counts below 0 or above `total` cannot produce NaN.
 * @param total Number of runs executed.
 * @returns The lower bound, clamped to [0, 1]. Returns 0 when `total` is
 *   zero, negative or NaN, or when `passes` is NaN.
 */
function wilsonSignificance(passes, total) {
  // `!(x > 0)` also rejects NaN, which a plain `=== 0` check would let through.
  if (Number.isNaN(passes) || !(total > 0))
    return 0;
  const z = 1.96;
  // Clamping keeps p * (1 - p) non-negative so the square root stays real.
  const p = Math.min(1, Math.max(0, passes / total));
  const denominator = 1 + (z * z) / total;
  const centre = p + (z * z) / (2 * total);
  const spread = z * Math.sqrt((p * (1 - p) + (z * z) / (4 * total)) / total);
  const lower = (centre - spread) / denominator;
  return Math.max(0, Math.min(1, lower));
}
33
+ async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig) {
17
34
  let response = { text: "" };
18
35
  let duration;
19
- const timeoutMs = scene.timeout ?? globalTimeout ?? DEFAULT_SCENE_TIMEOUT;
20
- const turns = scene.turns ?? globalTurns ?? 1;
21
36
  try {
22
37
  const start = performance.now();
23
38
  let input = scene.prompt;
@@ -38,21 +53,14 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
38
53
  }
39
54
  catch (err) {
40
55
  return {
41
- prompt: scene.prompt,
42
- response: { text: "", executionError: err.message },
43
- duration: 0,
44
56
  passed: false,
45
57
  error: err.message,
58
+ response: { text: "", executionError: err.message },
59
+ duration: 0,
46
60
  };
47
61
  }
48
62
  if (response.executionError) {
49
- return {
50
- prompt: scene.prompt,
51
- response,
52
- duration,
53
- passed: false,
54
- error: response.executionError,
55
- };
63
+ return { passed: false, error: response.executionError, response, duration };
56
64
  }
57
65
  let passed = true;
58
66
  let error;
@@ -94,5 +102,52 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
94
102
  }
95
103
  }
96
104
  }
97
- return { prompt: scene.prompt, response, duration, passed, error, judgement };
105
+ return { passed, error, response, duration, judgement };
106
+ }
107
/**
 * Execute a scene once, or `scene.runs` times, and fold the outcome into a
 * single scene result.
 *
 * With one run the result of that run is returned as-is, tagged with the
 * scene's prompt and suite. With several runs they are awaited one after
 * another and aggregated: the scene passes when strictly more than half of
 * the runs pass, `duration` is the sum of all run durations, and `passRate`
 * and `statisticalSignificance` describe the sample.
 *
 * The response, judgement and error reported for an aggregated scene all come
 * from one run whose verdict matches the overall verdict: the last passing
 * run when the scene passed, the first failing run when it failed.
 *
 * @param executor Agent executor handed to each run.
 * @param scene Scene definition; `timeout`, `turns`, `runs` and `suite` are read from it.
 * @param globalTimeout Timeout used when the scene does not set its own.
 * @param judgeConfig Judge configuration handed to each run.
 * @param globalTurns Turn count used when the scene does not set its own.
 * @returns The scene result, including per-run details when more than one run was executed.
 */
export async function executeScene(executor, scene, globalTimeout, judgeConfig, globalTurns) {
  const timeoutMs = scene.timeout ?? globalTimeout ?? DEFAULT_SCENE_TIMEOUT;
  const turns = scene.turns ?? globalTurns ?? 1;
  // A NaN count would skip the loop and leave nothing to aggregate, and an
  // infinite one would never finish; both fall back to a single run.
  const requestedRuns = Number(scene.runs ?? 1);
  const numRuns = Number.isFinite(requestedRuns) ? Math.ceil(requestedRuns) : 1;
  // Single run — original fast path
  if (numRuns <= 1) {
    const run = await executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig);
    return {
      prompt: scene.prompt,
      response: run.response,
      duration: run.duration,
      passed: run.passed,
      error: run.error,
      judgement: run.judgement,
      suite: scene.suite,
    };
  }
  // Multiple runs — execute N times, sequentially, and aggregate
  const runs = [];
  for (let i = 0; i < numRuns; i++) {
    runs.push(await executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig));
  }
  const passes = runs.filter((r) => r.passed).length;
  const passRate = passes / runs.length;
  const totalDuration = runs.reduce((sum, r) => sum + r.duration, 0);
  const statisticalSignificance = wilsonSignificance(passes, runs.length);
  // Overall pass = majority passed (> 50%); an exact tie counts as a failure
  const overallPassed = passRate > 0.5;
  // Pick one run that agrees with the overall verdict so the reported
  // response, judgement and error never contradict each other.
  const lastRun = runs[runs.length - 1];
  const representative = (overallPassed
    ? runs.slice().reverse().find((r) => r.passed)
    : runs.find((r) => !r.passed)) ?? lastRun;
  const error = overallPassed
    ? undefined
    : representative.error ?? "Majority of runs failed";
  return {
    prompt: scene.prompt,
    response: representative.response,
    duration: totalDuration,
    passed: overallPassed,
    error,
    judgement: representative.judgement,
    suite: scene.suite,
    runs,
    passRate,
    statisticalSignificance,
  };
}
package/dist/stats.js CHANGED
@@ -197,6 +197,8 @@ async function main() {
197
197
  const args = process.argv.slice(2);
198
198
  const agentFlagIdx = args.indexOf("--agent");
199
199
  const agentFilter = agentFlagIdx !== -1 ? args[agentFlagIdx + 1] : undefined;
200
+ const modelFlagIdx = args.indexOf("--model");
201
+ const modelFilter = modelFlagIdx !== -1 ? args[modelFlagIdx + 1] : undefined;
200
202
  if (args.includes("--purge")) {
201
203
  await purge(process.cwd());
202
204
  return;
@@ -220,8 +222,15 @@ async function main() {
220
222
  return;
221
223
  }
222
224
  }
225
+ if (modelFilter) {
226
+ reports = reports.filter((r) => r.model.toLowerCase() === modelFilter.toLowerCase());
227
+ if (reports.length === 0) {
228
+ console.log(`\n No reports found for model "${modelFilter}".\n`);
229
+ return;
230
+ }
231
+ }
223
232
  console.log("\n" + "━".repeat(W));
224
- const filterLabel = agentFilter ? ` · agent: ${agentFilter}` : "";
233
+ const filterLabel = (agentFilter ? ` · agent: ${agentFilter}` : "") + (modelFilter ? ` · model: ${modelFilter}` : "");
225
234
  console.log(` AGEST STATS · ${reports.length} report${reports.length !== 1 ? "s" : ""} found${filterLabel}`);
226
235
  console.log("━".repeat(W));
227
236
  // Aggregate by model
@@ -250,6 +259,29 @@ async function main() {
250
259
  value: a.avgSuccessRate,
251
260
  display: `${(a.avgSuccessRate * 100).toFixed(0).padStart(3)}%`,
252
261
  })), 1);
262
+ // Suite breakdown (aggregate across all reports that have suites)
263
+ const withSuites = reports.filter((r) => r.suites && r.suites.length > 0);
264
+ if (withSuites.length > 0) {
265
+ const suiteAgg = new Map();
266
+ for (const r of withSuites) {
267
+ for (const s of r.suites) {
268
+ const arr = suiteAgg.get(s.name) ?? [];
269
+ arr.push(s.successRate);
270
+ suiteAgg.set(s.name, arr);
271
+ }
272
+ }
273
+ const suiteRows = [...suiteAgg.entries()]
274
+ .map(([name, rates]) => {
275
+ const avgRate = rates.reduce((a, b) => a + b, 0) / rates.length;
276
+ return {
277
+ label: name,
278
+ value: avgRate,
279
+ display: `${(avgRate * 100).toFixed(0).padStart(3)}%`,
280
+ };
281
+ })
282
+ .sort((a, b) => b.value - a.value);
283
+ printSection("Suite Breakdown", suiteRows, 1);
284
+ }
253
285
  // Token charts (only when data is present)
254
286
  const withTokens = agg.filter((a) => a.avgInputTokens != null && a.avgOutputTokens != null);
255
287
  if (withTokens.length > 0) {
package/dist/types.d.ts CHANGED
@@ -14,6 +14,7 @@ export interface AgentResponse {
14
14
  [key: string]: unknown;
15
15
  };
16
16
  }
17
+ export type HookFn = () => void | Promise<void>;
17
18
  export interface SceneDefinition {
18
19
  prompt: string;
19
20
  assertions: Array<{
@@ -22,6 +23,8 @@ export interface SceneDefinition {
22
23
  }>;
23
24
  timeout?: number;
24
25
  turns?: number;
26
+ runs?: number;
27
+ suite?: string;
25
28
  }
26
29
  export type JudgeVerdict = "pass" | "fail" | "partial";
27
30
  export interface JudgeResult {
@@ -29,6 +32,13 @@ export interface JudgeResult {
29
32
  reasoning: string;
30
33
  criteria: string;
31
34
  }
35
/**
 * Outcome of a single execution of a scene, before several runs are
 * aggregated into one scene result.
 */
export interface RunResult {
  /** Response produced by the agent for this run. */
  response: AgentResponse;
  /** Duration recorded for this run. */
  duration: number;
  /** Whether this run passed. */
  passed: boolean;
  /** Error message recorded for this run, if any. */
  error?: string;
  /** Judge result for this run, if one was produced. */
  judgement?: JudgeResult;
}
32
42
  export interface SceneResult {
33
43
  prompt: string;
34
44
  response: AgentResponse;
@@ -36,6 +46,10 @@ export interface SceneResult {
36
46
  passed: boolean;
37
47
  error?: string;
38
48
  judgement?: JudgeResult;
49
+ suite?: string;
50
+ runs?: RunResult[];
51
+ passRate?: number;
52
+ statisticalSignificance?: number;
39
53
  }
40
54
  export interface AgentReport {
41
55
  name?: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sebastiantuyu/agest",
3
- "version": "0.3.0",
3
+ "version": "0.3.1",
4
4
  "description": "A testing library for agents",
5
5
  "repository": {
6
6
  "type": "git",