@sebastiantuyu/agest 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +3 -1
- package/dist/context.d.ts +14 -1
- package/dist/context.js +95 -11
- package/dist/index.d.ts +7 -2
- package/dist/index.js +26 -1
- package/dist/preview.js +670 -158
- package/dist/reporter.js +46 -0
- package/dist/reports.d.ts +20 -0
- package/dist/reports.js +99 -3
- package/dist/runner.js +69 -14
- package/dist/stats.js +33 -1
- package/dist/types.d.ts +14 -0
- package/package.json +1 -1
package/dist/reporter.js
CHANGED
|
@@ -23,6 +23,52 @@ export function formatReport(report) {
|
|
|
23
23
|
if (reason) {
|
|
24
24
|
lines.push(` reason: "${reason}"`);
|
|
25
25
|
}
|
|
26
|
+
const result = report.results.find((r) => r.prompt === c);
|
|
27
|
+
if (result?.response.text) {
|
|
28
|
+
const escaped = result.response.text.replace(/"/g, '\\"').replace(/\n/g, '\\n');
|
|
29
|
+
lines.push(` response: "${escaped}"`);
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
// Suite breakdown
|
|
34
|
+
const suites = new Set(report.results.map((r) => r.suite).filter(Boolean));
|
|
35
|
+
if (suites.size > 0) {
|
|
36
|
+
lines.push(` suites:`);
|
|
37
|
+
for (const s of suites) {
|
|
38
|
+
const suiteResults = report.results.filter((r) => r.suite === s);
|
|
39
|
+
const suitePassed = suiteResults.filter((r) => r.passed).length;
|
|
40
|
+
const suiteRate = suiteResults.length > 0
|
|
41
|
+
? Number((suitePassed / suiteResults.length).toFixed(2))
|
|
42
|
+
: 0;
|
|
43
|
+
lines.push(` - name: "${s}"`);
|
|
44
|
+
lines.push(` success_rate: ${suiteRate}`);
|
|
45
|
+
lines.push(` total_cases: ${suiteResults.length}`);
|
|
46
|
+
lines.push(` failed_cases_count: ${suiteResults.length - suitePassed}`);
|
|
47
|
+
if (suitePassed < suiteResults.length) {
|
|
48
|
+
lines.push(` failed_cases:`);
|
|
49
|
+
for (const r of suiteResults.filter((r) => !r.passed)) {
|
|
50
|
+
lines.push(` - "${r.prompt}"`);
|
|
51
|
+
if (r.error) {
|
|
52
|
+
lines.push(` reason: "${r.error}"`);
|
|
53
|
+
}
|
|
54
|
+
if (r.response.text) {
|
|
55
|
+
const escaped = r.response.text.replace(/"/g, '\\"').replace(/\n/g, '\\n');
|
|
56
|
+
lines.push(` response: "${escaped}"`);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
// Statistical runs summary
|
|
63
|
+
const withRuns = report.results.filter((r) => r.runs && r.runs.length > 1);
|
|
64
|
+
if (withRuns.length > 0) {
|
|
65
|
+
lines.push(` statistical_runs:`);
|
|
66
|
+
for (const r of withRuns) {
|
|
67
|
+
const label = r.prompt.length > 50 ? r.prompt.slice(0, 47) + "..." : r.prompt;
|
|
68
|
+
lines.push(` - "${label}"`);
|
|
69
|
+
lines.push(` runs: ${r.runs.length}`);
|
|
70
|
+
lines.push(` pass_rate: ${((r.passRate ?? 0) * 100).toFixed(1)}%`);
|
|
71
|
+
lines.push(` significance: ${((r.statisticalSignificance ?? 0) * 100).toFixed(1)}%`);
|
|
26
72
|
}
|
|
27
73
|
}
|
|
28
74
|
lines.push(` timestamp: "${report.timestamp}"`, ` duration: ${report.duration}`, ` total_cases: ${report.totalCases}`);
|
package/dist/reports.d.ts
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
/**
 * Summary of a single suite as read back from the `suites:` section of a
 * serialized report.
 */
export interface ParsedSuiteResult {
    /** Suite name, taken from the entry's `name` field. */
    name: string;
    /** Numeric value of the entry's `success_rate` field. */
    successRate: number;
    /** Numeric value of the entry's `total_cases` field. */
    totalCases: number;
    /** Numeric value of the entry's `failed_cases_count` field. */
    failedCasesCount: number;
    /** One item per prompt listed under the entry's `failed_cases:` key. */
    failedCases: {
        /** The quoted prompt text of the failed case. */
        prompt: string;
        /** Content of the optional `reason` line that follows the prompt. */
        reason?: string;
        /** Content of the optional `response` line that follows the prompt. */
        response?: string;
    }[];
}
|
|
1
12
|
export interface ParsedReport {
|
|
2
13
|
name?: string;
|
|
3
14
|
systemPromptHash?: string;
|
|
@@ -11,11 +22,13 @@ export interface ParsedReport {
|
|
|
11
22
|
failedCases: Array<{
|
|
12
23
|
prompt: string;
|
|
13
24
|
reason?: string;
|
|
25
|
+
response?: string;
|
|
14
26
|
}>;
|
|
15
27
|
duration: number;
|
|
16
28
|
timestamp: string;
|
|
17
29
|
averageInputTokensPerCase?: number;
|
|
18
30
|
averageOutputTokensPerCase?: number;
|
|
31
|
+
suites?: ParsedSuiteResult[];
|
|
19
32
|
source: string;
|
|
20
33
|
}
|
|
21
34
|
export interface DiffEntry {
|
|
@@ -27,8 +40,10 @@ export declare function extractField(content: string, key: string): string | und
|
|
|
27
40
|
export declare function parseFailedCases(content: string): Array<{
|
|
28
41
|
prompt: string;
|
|
29
42
|
reason?: string;
|
|
43
|
+
response?: string;
|
|
30
44
|
}>;
|
|
31
45
|
export declare function parseDimensions(content: string): Record<string, string> | undefined;
|
|
46
|
+
export declare function parseSuites(content: string): ParsedSuiteResult[] | undefined;
|
|
32
47
|
export declare function parseReport(content: string, source: string): ParsedReport;
|
|
33
48
|
export declare function findReports(dir: string, depth?: number): Promise<string[]>;
|
|
34
49
|
export declare function loadDiffEntry(hash: string): Promise<DiffEntry | null>;
|
|
@@ -75,4 +90,9 @@ export declare function findVaryingDimensions(reports: ParsedReport[]): string[]
|
|
|
75
90
|
* Group reports by the value of a specific dimension.
|
|
76
91
|
*/
|
|
77
92
|
export declare function groupByDimension(reports: ParsedReport[], dimension: string): Map<string, ParsedReport[]>;
|
|
93
|
+
/**
|
|
94
|
+
* Wilson score interval lower bound at 95% confidence.
|
|
95
|
+
* Gives a conservative success rate estimate that accounts for sample size.
|
|
96
|
+
*/
|
|
97
|
+
export declare function wilsonLowerBound(successRate: number, totalCases: number): number;
|
|
78
98
|
export declare function formatDuration(ms: number): string;
|
package/dist/reports.js
CHANGED
|
@@ -20,9 +20,21 @@ export function parseFailedCases(content) {
|
|
|
20
20
|
break;
|
|
21
21
|
const promptMatch = line.match(/^\s+- "(.+)"$/);
|
|
22
22
|
if (promptMatch) {
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
23
|
+
let reason;
|
|
24
|
+
let response;
|
|
25
|
+
// Look ahead for reason and response fields
|
|
26
|
+
for (let j = i + 1; j < Math.min(i + 3, lines.length); j++) {
|
|
27
|
+
const next = lines[j];
|
|
28
|
+
if (!next || !next.match(/^\s+(reason|response):/))
|
|
29
|
+
break;
|
|
30
|
+
const reasonMatch = next.match(/^\s+reason: "(.+)"$/);
|
|
31
|
+
if (reasonMatch)
|
|
32
|
+
reason = reasonMatch[1];
|
|
33
|
+
const responseMatch = next.match(/^\s+response: "(.+)"$/);
|
|
34
|
+
if (responseMatch)
|
|
35
|
+
response = responseMatch[1].replace(/\\n/g, '\n').replace(/\\"/g, '"');
|
|
36
|
+
}
|
|
37
|
+
cases.push({ prompt: promptMatch[1], reason, response });
|
|
26
38
|
}
|
|
27
39
|
}
|
|
28
40
|
return cases;
|
|
@@ -44,6 +56,74 @@ export function parseDimensions(content) {
|
|
|
44
56
|
}
|
|
45
57
|
return Object.keys(dims).length > 0 ? dims : undefined;
|
|
46
58
|
}
|
|
59
|
+
export function parseSuites(content) {
|
|
60
|
+
const lines = content.split("\n");
|
|
61
|
+
const startIdx = lines.findIndex((l) => l.trim() === "suites:");
|
|
62
|
+
if (startIdx === -1)
|
|
63
|
+
return undefined;
|
|
64
|
+
const suites = [];
|
|
65
|
+
let current = null;
|
|
66
|
+
let parsingFailedCases = false;
|
|
67
|
+
for (let i = startIdx + 1; i < lines.length; i++) {
|
|
68
|
+
const line = lines[i];
|
|
69
|
+
// Stop if we exit the suites indentation level
|
|
70
|
+
if (line.length > 0 && !line.startsWith(" "))
|
|
71
|
+
break;
|
|
72
|
+
if (line.trim() === "")
|
|
73
|
+
continue;
|
|
74
|
+
const nameMatch = line.match(/^\s+- name: "(.+)"$/);
|
|
75
|
+
if (nameMatch) {
|
|
76
|
+
if (current)
|
|
77
|
+
suites.push(current);
|
|
78
|
+
current = { name: nameMatch[1], failedCases: [], failedCasesCount: 0 };
|
|
79
|
+
parsingFailedCases = false;
|
|
80
|
+
continue;
|
|
81
|
+
}
|
|
82
|
+
if (!current)
|
|
83
|
+
continue;
|
|
84
|
+
const srMatch = line.match(/^\s+success_rate: (.+)$/);
|
|
85
|
+
if (srMatch) {
|
|
86
|
+
current.successRate = parseFloat(srMatch[1]);
|
|
87
|
+
continue;
|
|
88
|
+
}
|
|
89
|
+
const tcMatch = line.match(/^\s+total_cases: (.+)$/);
|
|
90
|
+
if (tcMatch) {
|
|
91
|
+
current.totalCases = parseInt(tcMatch[1], 10);
|
|
92
|
+
continue;
|
|
93
|
+
}
|
|
94
|
+
const fccMatch = line.match(/^\s+failed_cases_count: (.+)$/);
|
|
95
|
+
if (fccMatch) {
|
|
96
|
+
current.failedCasesCount = parseInt(fccMatch[1], 10);
|
|
97
|
+
continue;
|
|
98
|
+
}
|
|
99
|
+
if (line.trim() === "failed_cases:") {
|
|
100
|
+
parsingFailedCases = true;
|
|
101
|
+
continue;
|
|
102
|
+
}
|
|
103
|
+
if (parsingFailedCases) {
|
|
104
|
+
const promptMatch = line.match(/^\s+- "(.+)"$/);
|
|
105
|
+
if (promptMatch) {
|
|
106
|
+
let reason;
|
|
107
|
+
let response;
|
|
108
|
+
for (let j = i + 1; j < Math.min(i + 3, lines.length); j++) {
|
|
109
|
+
const next = lines[j];
|
|
110
|
+
if (!next || !next.match(/^\s+(reason|response):/))
|
|
111
|
+
break;
|
|
112
|
+
const reasonMatch = next.match(/^\s+reason: "(.+)"$/);
|
|
113
|
+
if (reasonMatch)
|
|
114
|
+
reason = reasonMatch[1];
|
|
115
|
+
const responseMatch = next.match(/^\s+response: "(.+)"$/);
|
|
116
|
+
if (responseMatch)
|
|
117
|
+
response = responseMatch[1].replace(/\\n/g, '\n').replace(/\\"/g, '"');
|
|
118
|
+
}
|
|
119
|
+
current.failedCases.push({ prompt: promptMatch[1], reason, response });
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
if (current)
|
|
124
|
+
suites.push(current);
|
|
125
|
+
return suites.length > 0 ? suites : undefined;
|
|
126
|
+
}
|
|
47
127
|
export function parseReport(content, source) {
|
|
48
128
|
const num = (key, fallback = 0) => parseFloat(extractField(content, key) ?? String(fallback));
|
|
49
129
|
const avgIn = extractField(content, "average_input_tokens_per_case");
|
|
@@ -78,6 +158,7 @@ export function parseReport(content, source) {
|
|
|
78
158
|
timestamp: extractField(content, "timestamp") ?? "",
|
|
79
159
|
averageInputTokensPerCase: avgIn != null ? parseFloat(avgIn) : undefined,
|
|
80
160
|
averageOutputTokensPerCase: avgOut != null ? parseFloat(avgOut) : undefined,
|
|
161
|
+
suites: parseSuites(content),
|
|
81
162
|
source,
|
|
82
163
|
};
|
|
83
164
|
}
|
|
@@ -267,6 +348,21 @@ export function groupByDimension(reports, dimension) {
|
|
|
267
348
|
}
|
|
268
349
|
return groups;
|
|
269
350
|
}
|
|
351
|
+
/**
 * Wilson score interval lower bound at 95% confidence (z = 1.96).
 * Gives a conservative success rate estimate that accounts for sample size:
 * the fewer cases there are, the further the result sits below the observed
 * rate.
 *
 * @param successRate Observed proportion of successes. Values outside
 *                    [0, 1] are clamped into that range; NaN yields 0.
 * @param totalCases  Number of observations. Zero, negative or NaN yields 0.
 * @returns The lower bound, always a number within [0, 1].
 */
export function wilsonLowerBound(successRate, totalCases) {
    // `!(x > 0)` also rejects NaN, which a plain `=== 0` check lets through.
    if (!(totalCases > 0) || Number.isNaN(successRate))
        return 0;
    const z = 1.96;
    // Clamp so p * (1 - p) cannot go negative and poison Math.sqrt with NaN.
    const p = Math.min(1, Math.max(0, successRate));
    const zSquared = z * z;
    const denominator = 1 + zSquared / totalCases;
    const centre = p + zSquared / (2 * totalCases);
    const spread = z * Math.sqrt((p * (1 - p) + zSquared / (4 * totalCases)) / totalCases);
    const lower = (centre - spread) / denominator;
    return Math.max(0, Math.min(1, lower));
}
|
|
270
366
|
export function formatDuration(ms) {
|
|
271
367
|
if (ms < 1000)
|
|
272
368
|
return `${ms.toFixed(0)}ms`;
|
package/dist/runner.js
CHANGED
|
@@ -13,11 +13,26 @@ export function extractField(response, field) {
|
|
|
13
13
|
return response.metadata?.[field];
|
|
14
14
|
}
|
|
15
15
|
}
|
|
16
|
-
|
|
16
|
+
/**
 * Compute the lower bound of the Wilson score interval for `passes`
 * successes out of `total` runs, using z = 1.96 (95% confidence level).
 *
 * The returned value is a conservative estimate of the true pass rate, not a
 * probability that the pass rate exceeds some threshold: callers that want
 * to know whether a scene is reliably better than chance should compare the
 * result against 0.5 themselves.
 *
 * @param passes Number of passing runs.
 * @param total  Number of runs. Zero, negative or NaN yields 0.
 * @returns The lower bound, always a number within [0, 1].
 */
function wilsonSignificance(passes, total) {
    // `!(x > 0)` also rejects NaN, which a plain `=== 0` check lets through.
    if (!(total > 0) || Number.isNaN(passes))
        return 0;
    const z = 1.96;
    // Clamp so p * (1 - p) cannot go negative and poison Math.sqrt with NaN.
    const p = Math.min(1, Math.max(0, passes / total));
    const zSquared = z * z;
    const denominator = 1 + zSquared / total;
    const centre = p + zSquared / (2 * total);
    const spread = z * Math.sqrt((p * (1 - p) + zSquared / (4 * total)) / total);
    const lower = (centre - spread) / denominator;
    // Return the lower bound clamped to [0, 1]
    return Math.max(0, Math.min(1, lower));
}
|
|
33
|
+
async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig) {
|
|
17
34
|
let response = { text: "" };
|
|
18
35
|
let duration;
|
|
19
|
-
const timeoutMs = scene.timeout ?? globalTimeout ?? DEFAULT_SCENE_TIMEOUT;
|
|
20
|
-
const turns = scene.turns ?? globalTurns ?? 1;
|
|
21
36
|
try {
|
|
22
37
|
const start = performance.now();
|
|
23
38
|
let input = scene.prompt;
|
|
@@ -38,21 +53,14 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
38
53
|
}
|
|
39
54
|
catch (err) {
|
|
40
55
|
return {
|
|
41
|
-
prompt: scene.prompt,
|
|
42
|
-
response: { text: "", executionError: err.message },
|
|
43
|
-
duration: 0,
|
|
44
56
|
passed: false,
|
|
45
57
|
error: err.message,
|
|
58
|
+
response: { text: "", executionError: err.message },
|
|
59
|
+
duration: 0,
|
|
46
60
|
};
|
|
47
61
|
}
|
|
48
62
|
if (response.executionError) {
|
|
49
|
-
return {
|
|
50
|
-
prompt: scene.prompt,
|
|
51
|
-
response,
|
|
52
|
-
duration,
|
|
53
|
-
passed: false,
|
|
54
|
-
error: response.executionError,
|
|
55
|
-
};
|
|
63
|
+
return { passed: false, error: response.executionError, response, duration };
|
|
56
64
|
}
|
|
57
65
|
let passed = true;
|
|
58
66
|
let error;
|
|
@@ -94,5 +102,52 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
94
102
|
}
|
|
95
103
|
}
|
|
96
104
|
}
|
|
97
|
-
return {
|
|
105
|
+
return { passed, error, response, duration, judgement };
|
|
106
|
+
}
|
|
107
|
+
/**
 * Execute a scene and return its result.
 *
 * Timeout and turn count are resolved from the scene first, then from the
 * global values, then from the defaults. When `scene.runs` is greater than
 * 1 the scene is executed that many times, one after another, and the
 * individual runs are aggregated into a single result carrying the pass
 * rate and a Wilson-based significance value.
 *
 * @param executor      Passed through to each individual run.
 * @param scene         Scene to execute (prompt, timeout, turns, runs, suite).
 * @param globalTimeout Fallback timeout when the scene does not set one.
 * @param judgeConfig   Passed through to each individual run.
 * @param globalTurns   Fallback turn count when the scene does not set one.
 * @returns The scene result; multi-run results additionally include `runs`,
 *          `passRate` and `statisticalSignificance`.
 */
export async function executeScene(executor, scene, globalTimeout, judgeConfig, globalTurns) {
    const timeoutMs = scene.timeout ?? globalTimeout ?? DEFAULT_SCENE_TIMEOUT;
    const turns = scene.turns ?? globalTurns ?? 1;
    const requestedRuns = scene.runs ?? 1;
    const runOnce = () => executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig);
    // One run: hand the single outcome straight back, tagged with prompt and suite.
    if (requestedRuns <= 1) {
        const only = await runOnce();
        return {
            prompt: scene.prompt,
            response: only.response,
            duration: only.duration,
            passed: only.passed,
            error: only.error,
            judgement: only.judgement,
            suite: scene.suite,
        };
    }
    // Several runs: execute strictly in sequence, then aggregate.
    const runs = [];
    while (runs.length < requestedRuns) {
        runs.push(await runOnce());
    }
    let passes = 0;
    let totalDuration = 0;
    let firstFailure;
    for (const attempt of runs) {
        totalDuration += attempt.duration;
        if (attempt.passed) {
            passes += 1;
        }
        else if (firstFailure === undefined) {
            firstFailure = attempt;
        }
    }
    const passRate = passes / runs.length;
    const statisticalSignificance = wilsonSignificance(passes, runs.length);
    // The final run supplies the representative response and judgement.
    const representative = runs[runs.length - 1];
    // The scene passes only when strictly more than half of the runs passed.
    const overallPassed = passRate > 0.5;
    let error;
    if (!overallPassed) {
        error = firstFailure?.error ?? "Majority of runs failed";
    }
    return {
        prompt: scene.prompt,
        response: representative.response,
        duration: totalDuration,
        passed: overallPassed,
        error,
        judgement: representative.judgement,
        suite: scene.suite,
        runs,
        passRate,
        statisticalSignificance,
    };
}
|
package/dist/stats.js
CHANGED
|
@@ -197,6 +197,8 @@ async function main() {
|
|
|
197
197
|
const args = process.argv.slice(2);
|
|
198
198
|
const agentFlagIdx = args.indexOf("--agent");
|
|
199
199
|
const agentFilter = agentFlagIdx !== -1 ? args[agentFlagIdx + 1] : undefined;
|
|
200
|
+
const modelFlagIdx = args.indexOf("--model");
|
|
201
|
+
const modelFilter = modelFlagIdx !== -1 ? args[modelFlagIdx + 1] : undefined;
|
|
200
202
|
if (args.includes("--purge")) {
|
|
201
203
|
await purge(process.cwd());
|
|
202
204
|
return;
|
|
@@ -220,8 +222,15 @@ async function main() {
|
|
|
220
222
|
return;
|
|
221
223
|
}
|
|
222
224
|
}
|
|
225
|
+
if (modelFilter) {
|
|
226
|
+
reports = reports.filter((r) => r.model.toLowerCase() === modelFilter.toLowerCase());
|
|
227
|
+
if (reports.length === 0) {
|
|
228
|
+
console.log(`\n No reports found for model "${modelFilter}".\n`);
|
|
229
|
+
return;
|
|
230
|
+
}
|
|
231
|
+
}
|
|
223
232
|
console.log("\n" + "━".repeat(W));
|
|
224
|
-
const filterLabel = agentFilter ? ` · agent: ${agentFilter}` : "";
|
|
233
|
+
const filterLabel = (agentFilter ? ` · agent: ${agentFilter}` : "") + (modelFilter ? ` · model: ${modelFilter}` : "");
|
|
225
234
|
console.log(` AGEST STATS · ${reports.length} report${reports.length !== 1 ? "s" : ""} found${filterLabel}`);
|
|
226
235
|
console.log("━".repeat(W));
|
|
227
236
|
// Aggregate by model
|
|
@@ -250,6 +259,29 @@ async function main() {
|
|
|
250
259
|
value: a.avgSuccessRate,
|
|
251
260
|
display: `${(a.avgSuccessRate * 100).toFixed(0).padStart(3)}%`,
|
|
252
261
|
})), 1);
|
|
262
|
+
// Suite breakdown (aggregate across all reports that have suites)
|
|
263
|
+
const withSuites = reports.filter((r) => r.suites && r.suites.length > 0);
|
|
264
|
+
if (withSuites.length > 0) {
|
|
265
|
+
const suiteAgg = new Map();
|
|
266
|
+
for (const r of withSuites) {
|
|
267
|
+
for (const s of r.suites) {
|
|
268
|
+
const arr = suiteAgg.get(s.name) ?? [];
|
|
269
|
+
arr.push(s.successRate);
|
|
270
|
+
suiteAgg.set(s.name, arr);
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
const suiteRows = [...suiteAgg.entries()]
|
|
274
|
+
.map(([name, rates]) => {
|
|
275
|
+
const avgRate = rates.reduce((a, b) => a + b, 0) / rates.length;
|
|
276
|
+
return {
|
|
277
|
+
label: name,
|
|
278
|
+
value: avgRate,
|
|
279
|
+
display: `${(avgRate * 100).toFixed(0).padStart(3)}%`,
|
|
280
|
+
};
|
|
281
|
+
})
|
|
282
|
+
.sort((a, b) => b.value - a.value);
|
|
283
|
+
printSection("Suite Breakdown", suiteRows, 1);
|
|
284
|
+
}
|
|
253
285
|
// Token charts (only when data is present)
|
|
254
286
|
const withTokens = agg.filter((a) => a.avgInputTokens != null && a.avgOutputTokens != null);
|
|
255
287
|
if (withTokens.length > 0) {
|
package/dist/types.d.ts
CHANGED
|
@@ -14,6 +14,7 @@ export interface AgentResponse {
|
|
|
14
14
|
[key: string]: unknown;
|
|
15
15
|
};
|
|
16
16
|
}
|
|
17
|
+
export type HookFn = () => void | Promise<void>;
|
|
17
18
|
export interface SceneDefinition {
|
|
18
19
|
prompt: string;
|
|
19
20
|
assertions: Array<{
|
|
@@ -22,6 +23,8 @@ export interface SceneDefinition {
|
|
|
22
23
|
}>;
|
|
23
24
|
timeout?: number;
|
|
24
25
|
turns?: number;
|
|
26
|
+
runs?: number;
|
|
27
|
+
suite?: string;
|
|
25
28
|
}
|
|
26
29
|
export type JudgeVerdict = "pass" | "fail" | "partial";
|
|
27
30
|
export interface JudgeResult {
|
|
@@ -29,6 +32,13 @@ export interface JudgeResult {
|
|
|
29
32
|
reasoning: string;
|
|
30
33
|
criteria: string;
|
|
31
34
|
}
|
|
35
|
+
/**
 * Outcome of one individual execution of a scene. A scene result may carry
 * a list of these in its `runs` field.
 */
export interface RunResult {
    /** Whether this execution passed. */
    passed: boolean;
    /** The agent response captured for this execution. */
    response: AgentResponse;
    /** Duration recorded for this execution. */
    duration: number;
    /** Failure message, when there is one. */
    error?: string;
    /** Judge result, when there is one. */
    judgement?: JudgeResult;
}
|
|
32
42
|
export interface SceneResult {
|
|
33
43
|
prompt: string;
|
|
34
44
|
response: AgentResponse;
|
|
@@ -36,6 +46,10 @@ export interface SceneResult {
|
|
|
36
46
|
passed: boolean;
|
|
37
47
|
error?: string;
|
|
38
48
|
judgement?: JudgeResult;
|
|
49
|
+
suite?: string;
|
|
50
|
+
runs?: RunResult[];
|
|
51
|
+
passRate?: number;
|
|
52
|
+
statisticalSignificance?: number;
|
|
39
53
|
}
|
|
40
54
|
export interface AgentReport {
|
|
41
55
|
name?: string;
|