@sebastiantuyu/agest 0.3.3-next.10 → 0.3.3-next.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +39 -0
- package/dist/cli.js +128 -8
- package/dist/context.d.ts +9 -0
- package/dist/context.js +115 -4
- package/dist/preview.js +5 -11
- package/dist/reporter.d.ts +13 -2
- package/dist/reporter.js +20 -22
- package/dist/reports.d.ts +28 -0
- package/dist/reports.js +151 -10
- package/dist/runner.js +2 -18
- package/dist/stats.js +53 -10
- package/dist/types.d.ts +37 -0
- package/package.json +1 -1
package/dist/cli.d.ts
CHANGED
|
@@ -1,8 +1,26 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import type { CheckpointRecord } from "./types.js";
|
|
3
|
+
/**
|
|
4
|
+
* One record per `agent()` run, appended by the child process (see
|
|
5
|
+
* AgentContext.execute → AGEST_SUMMARY_FILE). The parent reads them all back to
|
|
6
|
+
* print a vitest-style footer across files.
|
|
7
|
+
*/
|
|
8
|
+
interface RunSummaryRecord {
|
|
9
|
+
file: string;
|
|
10
|
+
name?: string;
|
|
11
|
+
total: number;
|
|
12
|
+
passed: number;
|
|
13
|
+
failed: number;
|
|
14
|
+
duration: number;
|
|
15
|
+
costUsd: number | null;
|
|
16
|
+
/** Full checkpoint payload the parent appends to the canonical run log. */
|
|
17
|
+
checkpoint?: CheckpointRecord;
|
|
18
|
+
}
|
|
2
19
|
export interface ParsedRunArgs {
|
|
3
20
|
pattern?: string;
|
|
4
21
|
targets: string[];
|
|
5
22
|
full: boolean;
|
|
23
|
+
record: boolean;
|
|
6
24
|
}
|
|
7
25
|
/**
|
|
8
26
|
* Extract the args that follow the command word from a full `process.argv`.
|
|
@@ -13,4 +31,25 @@ export interface ParsedRunArgs {
|
|
|
13
31
|
*/
|
|
14
32
|
export declare function getCommandArgs(argv: string[]): string[];
|
|
15
33
|
export declare function parseRunArgs(args: string[]): ParsedRunArgs;
|
|
34
|
+
export interface RunSummary {
|
|
35
|
+
/** Whether the footer should print — false for a single scene in one file. */
|
|
36
|
+
show: boolean;
|
|
37
|
+
discoveredFiles: number;
|
|
38
|
+
filesPassed: number;
|
|
39
|
+
filesFailed: number;
|
|
40
|
+
totalCases: number;
|
|
41
|
+
casesPassed: number;
|
|
42
|
+
casesFailed: number;
|
|
43
|
+
duration: number;
|
|
44
|
+
cost: number;
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Aggregate every child's records into the footer totals. The footer only
|
|
48
|
+
* shows when more than one case ran (multiple files, or one file with multiple
|
|
49
|
+
* scenes) — a single scene already prints its own one-line summary. A file
|
|
50
|
+
* counts as failed if any of its agent() runs had a failing case, or if it
|
|
51
|
+
* never wrote a record (crashed before reporting).
|
|
52
|
+
*/
|
|
53
|
+
export declare function aggregateRunSummary(records: RunSummaryRecord[], discoveredFiles: number): RunSummary;
|
|
16
54
|
export declare function main(argv: string[]): Promise<void>;
|
|
55
|
+
export {};
|
package/dist/cli.js
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { spawn } from "child_process";
|
|
3
3
|
import { fileURLToPath } from "node:url";
|
|
4
|
-
import { realpathSync } from "node:fs";
|
|
4
|
+
import { realpathSync, mkdtempSync, mkdirSync, readFileSync, appendFileSync, rmSync } from "node:fs";
|
|
5
|
+
import { randomUUID } from "node:crypto";
|
|
6
|
+
import { tmpdir } from "node:os";
|
|
7
|
+
import { join, dirname } from "node:path";
|
|
5
8
|
import { main as stats } from "./stats.js";
|
|
6
9
|
import { main as preview } from "./preview.js";
|
|
7
10
|
import { DEFAULT_PATTERN, discoverTestFiles } from "./discover.js";
|
|
11
|
+
import { c } from "./logger.js";
|
|
8
12
|
/**
|
|
9
13
|
* Extract the args that follow the command word from a full `process.argv`.
|
|
10
14
|
* `argv = [execPath, scriptPath, command, ...commandArgs]`, so the command's
|
|
@@ -19,6 +23,7 @@ export function parseRunArgs(args) {
|
|
|
19
23
|
const targets = [];
|
|
20
24
|
let pattern;
|
|
21
25
|
let full = false;
|
|
26
|
+
let record = false;
|
|
22
27
|
for (let i = 0; i < args.length; i++) {
|
|
23
28
|
const a = args[i];
|
|
24
29
|
if (a === "--pattern" || a === "-p") {
|
|
@@ -34,34 +39,146 @@ export function parseRunArgs(args) {
|
|
|
34
39
|
else if (a === "--full") {
|
|
35
40
|
full = true;
|
|
36
41
|
}
|
|
42
|
+
else if (a === "--record") {
|
|
43
|
+
record = true;
|
|
44
|
+
}
|
|
37
45
|
else {
|
|
38
46
|
targets.push(a);
|
|
39
47
|
}
|
|
40
48
|
}
|
|
41
|
-
return { pattern, targets, full };
|
|
49
|
+
return { pattern, targets, full, record };
|
|
42
50
|
}
|
|
43
51
|
async function run(args) {
|
|
44
|
-
const { pattern, targets, full } = parseRunArgs(args);
|
|
52
|
+
const { pattern, targets, full, record } = parseRunArgs(args);
|
|
45
53
|
const files = await discoverTestFiles(targets, { pattern });
|
|
46
54
|
if (files.length === 0) {
|
|
47
55
|
const effective = pattern ?? DEFAULT_PATTERN;
|
|
48
56
|
console.error(` No test files found (pattern: ${effective})`);
|
|
49
57
|
process.exit(1);
|
|
50
58
|
}
|
|
59
|
+
// Each child appends a summary record here; the parent reads them back for
|
|
60
|
+
// the aggregate footer. A unique dir keeps concurrent `agest run`s isolated.
|
|
61
|
+
const summaryFile = join(mkdtempSync(join(tmpdir(), "agest-")), "summary.jsonl");
|
|
62
|
+
// One sweepId per invocation groups every checkpoint row from this run.
|
|
63
|
+
const sweepId = randomUUID();
|
|
64
|
+
const childEnv = {
|
|
65
|
+
...process.env,
|
|
66
|
+
AGEST_SUMMARY_FILE: summaryFile,
|
|
67
|
+
AGEST_SWEEP_ID: sweepId,
|
|
68
|
+
// The test file renders its own output in a child process; propagate
|
|
69
|
+
// --full so it emits the waterfall + full report rather than lean results.
|
|
70
|
+
...(full ? { AGEST_FULL: "1" } : {}),
|
|
71
|
+
// Opt-in: persist a full per-scene YAML snapshot per agent() execution.
|
|
72
|
+
...(record ? { AGEST_RECORD: "1" } : {}),
|
|
73
|
+
};
|
|
74
|
+
let anyChildCrashed = false;
|
|
75
|
+
// Run every file (vitest-style) instead of bailing on the first failure, so
|
|
76
|
+
// the footer reflects the whole run. Exit non-zero at the end if any failed.
|
|
51
77
|
for (const file of files) {
|
|
52
78
|
const child = spawn("npx", ["tsx", file], {
|
|
53
79
|
stdio: "inherit",
|
|
54
80
|
shell: true,
|
|
55
|
-
|
|
56
|
-
// --full flag through the environment so it knows to emit the waterfall
|
|
57
|
-
// and full report rather than just per-scene results.
|
|
58
|
-
env: full ? { ...process.env, AGEST_FULL: "1" } : process.env,
|
|
81
|
+
env: childEnv,
|
|
59
82
|
});
|
|
60
83
|
const code = await new Promise((resolve) => child.on("close", (c) => resolve(c ?? 1)));
|
|
84
|
+
// A non-zero code means the file itself threw/crashed. Failing scenes do
|
|
85
|
+
// NOT surface here — the child resolves cleanly — so failure is read back
|
|
86
|
+
// from the summary records below.
|
|
61
87
|
if (code !== 0)
|
|
62
|
-
|
|
88
|
+
anyChildCrashed = true;
|
|
89
|
+
}
|
|
90
|
+
const records = readSummary(summaryFile);
|
|
91
|
+
writeCheckpoints(records);
|
|
92
|
+
printRunSummary(records, files.length);
|
|
93
|
+
try {
|
|
94
|
+
rmSync(dirname(summaryFile), { recursive: true, force: true });
|
|
95
|
+
}
|
|
96
|
+
catch {
|
|
97
|
+
/* best-effort cleanup */
|
|
98
|
+
}
|
|
99
|
+
const casesFailed = records.reduce((sum, r) => sum + r.failed, 0);
|
|
100
|
+
if (anyChildCrashed || casesFailed > 0)
|
|
101
|
+
process.exit(1);
|
|
102
|
+
}
|
|
103
|
+
/**
 * Read back the summary records that child processes appended to
 * `summaryFile` (one JSON document per line).
 *
 * Lines are parsed independently: a truncated or otherwise malformed line
 * (for example a child that died mid-append) is skipped instead of throwing
 * away every record that did parse. Values that are not JSON objects are
 * skipped as well, because callers read fields such as `failed` off each
 * record.
 *
 * @param summaryFile Path of the JSONL summary file.
 * @returns The parsed records in file order; `[]` when the file cannot be
 *          read at all.
 */
function readSummary(summaryFile) {
    let content;
    try {
        content = readFileSync(summaryFile, "utf8");
    }
    catch {
        return []; // no children wrote results (older lib, or all crashed early)
    }
    const records = [];
    for (const line of content.split("\n")) {
        const trimmed = line.trim();
        if (!trimmed)
            continue;
        try {
            const parsed = JSON.parse(trimmed);
            if (parsed !== null && typeof parsed === "object")
                records.push(parsed);
        }
        catch {
            /* skip malformed line, keep the rest */
        }
    }
    return records;
}
|
|
114
|
+
/**
 * Persist the checkpoint payloads carried by the children's summary records.
 *
 * Records without a `checkpoint` are ignored. The remaining payloads are
 * serialized one JSON document per line and appended, in a single append
 * call, to `.reports/checkpoints.jsonl` under the current working directory
 * (the directory is created on demand). Any failure is swallowed on purpose:
 * persistence is best-effort and must never break a run.
 */
function writeCheckpoints(records) {
    const payloads = [];
    for (const record of records) {
        const payload = record.checkpoint;
        if (payload != null)
            payloads.push(payload);
    }
    if (payloads.length === 0)
        return;
    try {
        const reportsDir = join(process.cwd(), ".reports");
        mkdirSync(reportsDir, { recursive: true });
        const serialized = payloads.map((payload) => JSON.stringify(payload));
        const body = serialized.join("\n") + "\n";
        appendFileSync(join(reportsDir, "checkpoints.jsonl"), body, "utf8");
    }
    catch {
        /* ignore */
    }
}
|
|
135
|
+
/**
 * Aggregate every child's records into the footer totals.
 *
 * The footer only shows when at least one record exists and more than one
 * case ran (multiple files, or one file with multiple scenes) — a single
 * scene already prints its own one-line summary. A file counts as failed if
 * any of its agent() runs had a failing case, or if it never wrote a record
 * (crashed before reporting).
 *
 * Records are read back from JSON written by other processes, so every
 * counter is sanitized: a missing or non-finite value contributes 0 instead
 * of turning a total into NaN. `filesPassed` is floored at 0 for the case
 * where records name more distinct files than were discovered.
 *
 * @param records Per-run summary records (`file`, `total`, `passed`,
 *                `failed`, `duration`, `costUsd`).
 * @param discoveredFiles Number of test files the run discovered.
 * @returns The totals rendered by the footer.
 */
export function aggregateRunSummary(records, discoveredFiles) {
    const num = (value) => (typeof value === "number" && Number.isFinite(value) ? value : 0);
    const failsByFile = new Map();
    let totalCases = 0;
    let casesPassed = 0;
    let duration = 0;
    let cost = 0;
    for (const r of records) {
        totalCases += num(r.total);
        casesPassed += num(r.passed);
        duration += num(r.duration);
        cost += num(r.costUsd);
        failsByFile.set(r.file, (failsByFile.get(r.file) ?? 0) + num(r.failed));
    }
    // Files that were discovered but never reported are treated as failed.
    const missing = Math.max(0, discoveredFiles - failsByFile.size);
    let filesWithFailures = 0;
    for (const fails of failsByFile.values()) {
        if (fails > 0)
            filesWithFailures++;
    }
    const filesFailed = filesWithFailures + missing;
    return {
        show: records.length > 0 && (discoveredFiles > 1 || totalCases > 1),
        discoveredFiles,
        filesPassed: Math.max(0, discoveredFiles - filesFailed),
        filesFailed,
        totalCases,
        casesPassed,
        casesFailed: totalCases - casesPassed,
        duration,
        cost,
    };
}
|
|
163
|
+
/**
 * Render the cross-file footer on stdout.
 *
 * All arithmetic lives in aggregateRunSummary; this function only formats.
 * Nothing is printed when the aggregate says the footer should not show.
 * Output is a blank line followed by "Test Files", "Tests" and "Duration"
 * rows, plus a "Cost" row when the summed cost is greater than zero. Labels
 * are right-aligned to 11 characters.
 */
function printRunSummary(records, discoveredFiles) {
    const summary = aggregateRunSummary(records, discoveredFiles);
    if (!summary.show)
        return;
    // "<n> failed | <n> passed (<total>)", with the failed part omitted when 0.
    const formatTally = (failed, passed, total) => {
        const parts = [];
        if (failed > 0) {
            parts.push(c.red(`${failed} failed`), c.dim("|"));
        }
        parts.push(c.green(`${passed} passed`), c.dim(`(${total})`));
        return parts.join(" ");
    };
    console.log("");
    const rows = [
        ["Test Files", formatTally(summary.filesFailed, summary.filesPassed, summary.discoveredFiles)],
        ["Tests", formatTally(summary.casesFailed, summary.casesPassed, summary.totalCases)],
        ["Duration", `${summary.duration}ms`],
    ];
    if (summary.cost > 0) {
        rows.push(["Cost", c.green(`$${Number(summary.cost.toFixed(4))}`)]);
    }
    for (const [label, value] of rows) {
        console.log(`${c.dim(label.padStart(11))} ${value}`);
    }
}
|
|
65
182
|
function printUsage() {
|
|
66
183
|
console.log(`
|
|
67
184
|
Usage: agest <command>
|
|
@@ -72,7 +189,10 @@ function printUsage() {
|
|
|
72
189
|
agest run src/agest --pattern "**/*.test.ts"
|
|
73
190
|
agest run "tests/**/*.agest.ts" path/to/file.agest.ts
|
|
74
191
|
agest run tests/ --full # also print waterfall + full report
|
|
192
|
+
agest run tests/ --record # also save a full per-scene snapshot
|
|
75
193
|
stats Show aggregated test statistics
|
|
194
|
+
agest stats --suite <suiteHash> # filter to one suite's history
|
|
195
|
+
agest stats --export-csv [path] # flatten the run log to CSV
|
|
76
196
|
preview Generate an HTML report preview
|
|
77
197
|
`);
|
|
78
198
|
}
|
package/dist/context.d.ts
CHANGED
|
@@ -53,5 +53,14 @@ export declare class AgentContext<T = string> {
|
|
|
53
53
|
execute(): Promise<AgentReport<T>>;
|
|
54
54
|
}
|
|
55
55
|
export declare function hashPromptOnly(prompt: string): string;
|
|
56
|
+
/**
|
|
57
|
+
* Identity hash of the test suite: prompts, suite names, assertion field names +
|
|
58
|
+
* bodies (`fn.toString()`), and schema presence. Makes the suite a first-class
|
|
59
|
+
* dimension so a comparison can never silently span a changed set of scenes.
|
|
60
|
+
* `fn.toString()` is formatting/closure-sensitive — it over-segments (a cosmetic
|
|
61
|
+
* edit yields a new hash) but never silently merges two different suites, which
|
|
62
|
+
* is the safe direction.
|
|
63
|
+
*/
|
|
64
|
+
export declare function computeSuiteHash(definitions: SceneDefinition[]): string;
|
|
56
65
|
export declare function setContext(ctx: AgentContext<any> | null): void;
|
|
57
66
|
export declare function getContext(): AgentContext<any>;
|
package/dist/context.js
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
|
-
import { createHash } from "crypto";
|
|
1
|
+
import { createHash, randomUUID } from "crypto";
|
|
2
|
+
import { appendFileSync } from "node:fs";
|
|
3
|
+
import { relative } from "node:path";
|
|
2
4
|
import { executeScene } from "./runner";
|
|
3
5
|
import { resolveText } from "./resolve";
|
|
4
|
-
import { formatReport,
|
|
6
|
+
import { formatReport, writeSnapshot, appendCheckpoint, writeDiffEntry } from "./reporter";
|
|
7
|
+
import { wilsonInterval } from "./reports";
|
|
5
8
|
import { logger, c } from "./logger";
|
|
6
9
|
import { loadConfig } from "./config";
|
|
7
10
|
import { setPricingOverrides } from "./pricing";
|
|
@@ -239,6 +242,9 @@ export class AgentContext {
|
|
|
239
242
|
: undefined;
|
|
240
243
|
const firstMeta = results.find((r) => r.response.metadata)?.response
|
|
241
244
|
.metadata;
|
|
245
|
+
// Config identity. suiteHash + judge + runs complete the dimension set so
|
|
246
|
+
// comparisons never silently span a changed suite or sampling config.
|
|
247
|
+
// `temperature` is read opportunistically (open metadata map) when present.
|
|
242
248
|
const dimensions = {};
|
|
243
249
|
if (firstMeta?.model)
|
|
244
250
|
dimensions.model = firstMeta.model;
|
|
@@ -248,6 +254,28 @@ export class AgentContext {
|
|
|
248
254
|
dimensions.tools = [...firstMeta.tools].sort().join(",");
|
|
249
255
|
else
|
|
250
256
|
dimensions.tools = "none";
|
|
257
|
+
dimensions.suiteHash = computeSuiteHash(definitions);
|
|
258
|
+
dimensions.judge = config.judge?.model ?? "none";
|
|
259
|
+
dimensions.runs = String(config.runs ?? 1);
|
|
260
|
+
const temperature = firstMeta?.temperature;
|
|
261
|
+
if (temperature != null)
|
|
262
|
+
dimensions.temperature = String(temperature);
|
|
263
|
+
// Report-level statistical honesty. The trial basis is Σ runs across every
|
|
264
|
+
// scene (a multi-run scene contributes N trials), not just case count.
|
|
265
|
+
const casesPassed = results.filter((r) => r.passed).length;
|
|
266
|
+
let trials = 0;
|
|
267
|
+
let trialPasses = 0;
|
|
268
|
+
for (const r of results) {
|
|
269
|
+
if (r.runs && r.runs.length) {
|
|
270
|
+
trials += r.runs.length;
|
|
271
|
+
trialPasses += r.runs.filter((x) => x.passed).length;
|
|
272
|
+
}
|
|
273
|
+
else {
|
|
274
|
+
trials += 1;
|
|
275
|
+
trialPasses += r.passed ? 1 : 0;
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
const wilson = wilsonInterval(trialPasses, trials);
|
|
251
279
|
const report = {
|
|
252
280
|
name: this._name,
|
|
253
281
|
model: firstMeta?.model,
|
|
@@ -265,6 +293,10 @@ export class AgentContext {
|
|
|
265
293
|
timestamp: new Date().toISOString(),
|
|
266
294
|
duration: Math.round(totalDuration),
|
|
267
295
|
totalCases: results.length,
|
|
296
|
+
casesPassed,
|
|
297
|
+
runsPerScene: config.runs ?? 1,
|
|
298
|
+
wilsonLow: wilson.low,
|
|
299
|
+
wilsonHigh: wilson.high,
|
|
268
300
|
averageInputTokensPerCase,
|
|
269
301
|
averageOutputTokensPerCase,
|
|
270
302
|
totalInputTokens,
|
|
@@ -286,8 +318,70 @@ export class AgentContext {
|
|
|
286
318
|
const costSummary = totalCostUsd != null ? ` ${c.dim("·")} ${c.green(`$${Number(totalCostUsd.toFixed(4))}`)}` : "";
|
|
287
319
|
logger.info(`${rateColor(`${passed}/${results.length} passed`)} ${c.dim(`(${(successRate * 100).toFixed(0)}%)`)} ${c.dim("·")} ${c.dim(`${Math.round(totalDuration)}ms`)}${costSummary}`);
|
|
288
320
|
}
|
|
289
|
-
|
|
290
|
-
|
|
321
|
+
// A unique runId per agent() execution names the optional snapshot (so
|
|
322
|
+
// snapshots never clobber) and tags the checkpoint record.
|
|
323
|
+
const sweepId = process.env.AGEST_SWEEP_ID;
|
|
324
|
+
const runId = `${sweepId ?? "local"}-${randomUUID().slice(0, 8)}`;
|
|
325
|
+
// Heavy per-scene snapshot is opt-in via --record (AGEST_RECORD).
|
|
326
|
+
let recordPath;
|
|
327
|
+
if (process.env.AGEST_RECORD === "1") {
|
|
328
|
+
const snapPath = await writeSnapshot(formatted, runId);
|
|
329
|
+
recordPath = relative(process.cwd(), snapPath);
|
|
330
|
+
logger.info(`${c.dim("Snapshot saved to:")} ${c.cyan(recordPath)}`);
|
|
331
|
+
}
|
|
332
|
+
const checkpoint = {
|
|
333
|
+
runId,
|
|
334
|
+
sweepId,
|
|
335
|
+
timestamp: report.timestamp,
|
|
336
|
+
agentName: this._name,
|
|
337
|
+
model: report.model,
|
|
338
|
+
systemPromptHash: report.systemPromptHash,
|
|
339
|
+
tools: report.tools,
|
|
340
|
+
dimensions,
|
|
341
|
+
runsPerScene: report.runsPerScene,
|
|
342
|
+
totalCases: report.totalCases,
|
|
343
|
+
casesPassed,
|
|
344
|
+
successRate,
|
|
345
|
+
wilsonLow: report.wilsonLow,
|
|
346
|
+
wilsonHigh: report.wilsonHigh,
|
|
347
|
+
durationMs: Math.round(totalDuration),
|
|
348
|
+
costUsd: totalCostUsd ?? null,
|
|
349
|
+
totalInputTokens,
|
|
350
|
+
totalOutputTokens,
|
|
351
|
+
avgInputTokensPerCase: averageInputTokensPerCase,
|
|
352
|
+
avgOutputTokensPerCase: averageOutputTokensPerCase,
|
|
353
|
+
recordPath,
|
|
354
|
+
};
|
|
355
|
+
// When launched by `agest run`, hand the record to the parent (single writer
|
|
356
|
+
// of the checkpoint log) and let it print the cross-file footer. Standalone
|
|
357
|
+
// (`tsx foo.agest.ts`), this process is the lone writer — append directly.
|
|
358
|
+
// Best-effort throughout: never let persistence break a run.
|
|
359
|
+
const summaryFile = process.env.AGEST_SUMMARY_FILE;
|
|
360
|
+
if (summaryFile) {
|
|
361
|
+
try {
|
|
362
|
+
appendFileSync(summaryFile, JSON.stringify({
|
|
363
|
+
file: process.argv[1],
|
|
364
|
+
name: this._name,
|
|
365
|
+
total: results.length,
|
|
366
|
+
passed: casesPassed,
|
|
367
|
+
failed: results.length - casesPassed,
|
|
368
|
+
duration: Math.round(totalDuration),
|
|
369
|
+
costUsd: totalCostUsd ?? null,
|
|
370
|
+
checkpoint,
|
|
371
|
+
}) + "\n");
|
|
372
|
+
}
|
|
373
|
+
catch {
|
|
374
|
+
/* ignore */
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
else {
|
|
378
|
+
try {
|
|
379
|
+
await appendCheckpoint(checkpoint);
|
|
380
|
+
}
|
|
381
|
+
catch {
|
|
382
|
+
/* ignore */
|
|
383
|
+
}
|
|
384
|
+
}
|
|
291
385
|
return report;
|
|
292
386
|
}
|
|
293
387
|
}
|
|
@@ -298,6 +392,23 @@ function hashPrompt(prompt, model) {
|
|
|
298
392
|
export function hashPromptOnly(prompt) {
|
|
299
393
|
return createHash("sha256").update(prompt).digest("hex").slice(0, 12);
|
|
300
394
|
}
|
|
395
|
+
/**
 * Identity hash of a test suite.
 *
 * Each scene definition is reduced to a canonical object holding its prompt,
 * its suite name (`null` when absent), a "1"/"0" flag for whether a schema is
 * present, and every assertion's field name plus the source text of its
 * function (`fn.toString()`). The JSON form of that list is hashed with
 * SHA-256 and the first 12 hex characters are returned.
 *
 * Because function source text is part of the input, a purely cosmetic edit
 * to an assertion produces a different hash.
 */
export function computeSuiteHash(definitions) {
    const canonical = [];
    for (const definition of definitions) {
        const assertions = definition.assertions.map((assertion) => {
            return { field: assertion.field, fn: assertion.fn.toString() };
        });
        canonical.push({
            prompt: definition.prompt,
            suite: definition.suite ?? null,
            schema: definition.schema ? "1" : "0",
            assertions,
        });
    }
    const serialized = JSON.stringify(canonical);
    const digest = createHash("sha256").update(serialized).digest("hex");
    return digest.slice(0, 12);
}
|
|
301
412
|
// The active context is a runtime singleton holding an executor of arbitrary
|
|
302
413
|
// value type, so `any` is the honest type for the holder. The generic flows
|
|
303
414
|
// through `agent()` → `AgentContext<T>` → the report at the call site.
|
package/dist/preview.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { join
|
|
1
|
+
import { writeFile } from "fs/promises";
|
|
2
|
+
import { join } from "path";
|
|
3
3
|
import os from "os";
|
|
4
4
|
import { exec } from "child_process";
|
|
5
|
-
import {
|
|
5
|
+
import { loadReports, loadDiffEntry, wilsonLowerBound, computeDiff, formatDuration, findVaryingDimensions, groupByDimension, findControlledPairs, diffConfigs, } from "./reports.js";
|
|
6
6
|
// ---------------------------------------------------------------------------
|
|
7
7
|
// Helpers
|
|
8
8
|
// ---------------------------------------------------------------------------
|
|
@@ -1313,17 +1313,11 @@ function generateHTML(groups, totalReports) {
|
|
|
1313
1313
|
// ---------------------------------------------------------------------------
|
|
1314
1314
|
async function main() {
|
|
1315
1315
|
const cwd = process.cwd();
|
|
1316
|
-
const
|
|
1317
|
-
if (
|
|
1316
|
+
const reports = await loadReports(cwd);
|
|
1317
|
+
if (reports.length === 0) {
|
|
1318
1318
|
console.log("\n No reports found. Run some agent tests first.\n");
|
|
1319
1319
|
return;
|
|
1320
1320
|
}
|
|
1321
|
-
const reports = await Promise.all(files.map(async (f) => {
|
|
1322
|
-
const content = await readFile(f, "utf-8");
|
|
1323
|
-
return parseReport(content, relative(cwd, f));
|
|
1324
|
-
}));
|
|
1325
|
-
// Ensure all reports have dimensions (backward compat)
|
|
1326
|
-
await Promise.all(reports.map((r) => ensureDimensions(r)));
|
|
1327
1321
|
// Group by agent name, sort each group oldest -> newest
|
|
1328
1322
|
const groupMap = new Map();
|
|
1329
1323
|
for (const r of reports) {
|
package/dist/reporter.d.ts
CHANGED
|
@@ -1,4 +1,15 @@
|
|
|
1
|
-
import type { AgentReport } from "./types";
|
|
1
|
+
import type { AgentReport, CheckpointRecord } from "./types";
|
|
2
2
|
export declare function formatReport(report: AgentReport<unknown>): string;
|
|
3
|
-
|
|
3
|
+
/**
|
|
4
|
+
* Write a full per-scene YAML report under `.reports/runs/<runId>.yaml`. The
|
|
5
|
+
* runId is unique per `agent()` execution, so snapshots never clobber and need
|
|
6
|
+
* no locking. Only written when a run opts in via `--record`.
|
|
7
|
+
*/
|
|
8
|
+
export declare function writeSnapshot(content: string, runId: string): Promise<string>;
|
|
9
|
+
/**
|
|
10
|
+
* Append one record to the canonical append-only run log
|
|
11
|
+
* (`.reports/checkpoints.jsonl`). Used on the standalone path (a lone test-file
|
|
12
|
+
* process). When launched by `agest run`, the PARENT owns this write instead.
|
|
13
|
+
*/
|
|
14
|
+
export declare function appendCheckpoint(record: CheckpointRecord): Promise<void>;
|
|
4
15
|
export declare function writeDiffEntry(hash: string, systemPrompt: string, tools: string[], model?: string): Promise<void>;
|
package/dist/reporter.js
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import { access, mkdir, writeFile } from "fs/promises";
|
|
2
|
-
import { createHash } from "crypto";
|
|
1
|
+
import { access, appendFile, mkdir, writeFile } from "fs/promises";
|
|
3
2
|
import { join } from "path";
|
|
4
3
|
import { resolveText } from "./resolve";
|
|
5
4
|
export function formatReport(report) {
|
|
@@ -153,29 +152,28 @@ function formatUsd(n) {
|
|
|
153
152
|
function escapeYaml(s) {
|
|
154
153
|
return s.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, "\\n");
|
|
155
154
|
}
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
}
|
|
166
|
-
else {
|
|
167
|
-
const safestamp = timestamp.replace(/[:.]/g, "-");
|
|
168
|
-
filename = `report${safename}-${safestamp}.yaml`;
|
|
169
|
-
}
|
|
170
|
-
const filepath = join(reportsDir, filename);
|
|
171
|
-
try {
|
|
172
|
-
await access(filepath);
|
|
173
|
-
console.warn(`\x1b[33m⚠ Overwriting previous report for ${name ?? "unnamed"} (same config)\x1b[0m`);
|
|
174
|
-
}
|
|
175
|
-
catch { }
|
|
155
|
+
/**
 * Write `content` to `.reports/runs/<runId>.yaml` under the current working
 * directory, creating the directory tree when needed.
 *
 * @param content Text to write (UTF-8).
 * @param runId Identifier used as the file's base name.
 * @returns The path of the file that was written.
 */
export async function writeSnapshot(content, runId) {
    const targetDir = join(process.cwd(), ".reports", "runs");
    await mkdir(targetDir, { recursive: true });
    const target = join(targetDir, `${runId}.yaml`);
    await writeFile(target, content, "utf-8");
    return target;
}
|
|
167
|
+
/**
 * Append a single checkpoint record, serialized as one JSON line, to
 * `.reports/checkpoints.jsonl` under the current working directory. The
 * `.reports` directory is created when it does not exist yet.
 *
 * @param record The checkpoint record to persist.
 */
export async function appendCheckpoint(record) {
    const dir = join(process.cwd(), ".reports");
    await mkdir(dir, { recursive: true });
    const logPath = join(dir, "checkpoints.jsonl");
    const line = JSON.stringify(record) + "\n";
    await appendFile(logPath, line, "utf-8");
}
|
|
179
177
|
export async function writeDiffEntry(hash, systemPrompt, tools, model) {
|
|
180
178
|
const diffDir = join(process.cwd(), ".diff");
|
|
181
179
|
await mkdir(diffDir, { recursive: true });
|
package/dist/reports.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import type { CheckpointRecord } from "./types";
|
|
1
2
|
export interface ParsedSuiteResult {
|
|
2
3
|
name: string;
|
|
3
4
|
successRate: number;
|
|
@@ -127,9 +128,36 @@ export declare function findVaryingDimensions(reports: ParsedReport[]): string[]
|
|
|
127
128
|
* Group reports by the value of a specific dimension.
|
|
128
129
|
*/
|
|
129
130
|
export declare function groupByDimension(reports: ParsedReport[], dimension: string): Map<string, ParsedReport[]>;
|
|
131
|
+
/** Wilson interval from a pass count over a trial count. */
|
|
132
|
+
export declare function wilsonInterval(passes: number, total: number): {
|
|
133
|
+
low: number;
|
|
134
|
+
high: number;
|
|
135
|
+
};
|
|
130
136
|
/**
|
|
131
137
|
* Wilson score interval lower bound at 95% confidence.
|
|
132
138
|
* Gives a conservative success rate estimate that accounts for sample size.
|
|
133
139
|
*/
|
|
134
140
|
export declare function wilsonLowerBound(successRate: number, totalCases: number): number;
|
|
141
|
+
/** Walk for `.reports/checkpoints.jsonl` files, depth-limited like findReports. */
|
|
142
|
+
export declare function findCheckpointFiles(dir: string, depth?: number): Promise<string[]>;
|
|
143
|
+
/**
|
|
144
|
+
* Read every checkpoint record from the log(s) under `cwd`. Malformed lines
|
|
145
|
+
* (e.g. a crash mid-append) are skipped defensively rather than failing the
|
|
146
|
+
* whole read.
|
|
147
|
+
*/
|
|
148
|
+
export declare function readCheckpoints(cwd: string): Promise<CheckpointRecord[]>;
|
|
149
|
+
/**
|
|
150
|
+
* Adapt a checkpoint record to the ParsedReport shape the stats/preview
|
|
151
|
+
* renderers consume. When the record references a `--record` snapshot, the
|
|
152
|
+
* per-scene detail (scenes / suites / failed cases) is loaded lazily from it;
|
|
153
|
+
* otherwise the lightweight (record-only) view is returned.
|
|
154
|
+
*/
|
|
155
|
+
export declare function checkpointToReport(rec: CheckpointRecord): Promise<ParsedReport>;
|
|
156
|
+
/**
|
|
157
|
+
* Load all runs as ParsedReports: the canonical checkpoint log (primary) plus
|
|
158
|
+
* any legacy `report-*.yaml` still on disk (backward compat). Snapshots under
|
|
159
|
+
* `.reports/runs/` are NOT scanned directly — they are reached via a record's
|
|
160
|
+
* recordPath, so they never double-count.
|
|
161
|
+
*/
|
|
162
|
+
export declare function loadReports(cwd: string): Promise<ParsedReport[]>;
|
|
135
163
|
export declare function formatDuration(ms: number): string;
|
package/dist/reports.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { createHash } from "crypto";
|
|
2
2
|
import { readdir, readFile } from "fs/promises";
|
|
3
|
-
import { join } from "path";
|
|
3
|
+
import { join, relative } from "path";
|
|
4
4
|
export function extractField(content, key) {
|
|
5
5
|
const regex = new RegExp(`^ ${key}:\\s*(.+)$`, "m");
|
|
6
6
|
const match = content.match(regex);
|
|
@@ -474,20 +474,161 @@ export function groupByDimension(reports, dimension) {
|
|
|
474
474
|
}
|
|
475
475
|
return groups;
|
|
476
476
|
}
|
|
477
|
+
/**
 * Wilson score interval bounds (z = 1.96, i.e. 95% confidence) for an
 * observed rate `p` over `total` trials.
 *
 * Degenerate inputs never produce NaN:
 *  - a `total` that is zero, negative or NaN, or a non-finite `p`, yields
 *    `{ low: 0, high: 0 }`;
 *  - `p` is clamped into [0, 1] before use, since a rate outside that range
 *    would put a negative value under the square root.
 *
 * @param p Observed success rate, expected in [0, 1].
 * @param total Number of trials the rate was observed over.
 * @returns `{ low, high }`, both clamped into [0, 1].
 */
function wilsonBounds(p, total) {
    if (!(total > 0) || !Number.isFinite(p))
        return { low: 0, high: 0 };
    const clamp = (x) => Math.max(0, Math.min(1, x));
    const rate = clamp(p);
    const z = 1.96;
    const denominator = 1 + (z * z) / total;
    const centre = rate + (z * z) / (2 * total);
    const spread = z * Math.sqrt((rate * (1 - rate) + (z * z) / (4 * total)) / total);
    return {
        low: clamp((centre - spread) / denominator),
        high: clamp((centre + spread) / denominator),
    };
}
|
|
495
|
+
/**
 * Wilson interval for `passes` successes out of `total` trials. The observed
 * rate is taken as 0 when there are no trials, then handed to wilsonBounds.
 */
export function wilsonInterval(passes, total) {
    const observedRate = total === 0 ? 0 : passes / total;
    return wilsonBounds(observedRate, total);
}
|
|
477
499
|
/**
|
|
478
500
|
* Wilson score interval lower bound at 95% confidence.
|
|
479
501
|
* Gives a conservative success rate estimate that accounts for sample size.
|
|
480
502
|
*/
|
|
481
503
|
export function wilsonLowerBound(successRate, totalCases) {
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
504
|
+
return wilsonBounds(successRate, totalCases).low;
|
|
505
|
+
}
|
|
506
|
+
// ---------------------------------------------------------------------------
|
|
507
|
+
// Checkpoint log (canonical run store: .reports/checkpoints.jsonl)
|
|
508
|
+
// ---------------------------------------------------------------------------
|
|
509
|
+
/**
 * Recursively collect the paths of every `.reports/checkpoints.jsonl` under
 * `dir`.
 *
 * The walk stops below depth 6, never enters `node_modules`, `dist`, `.git`
 * or `.pnpm`, and does not descend into other dot-directories. A directory
 * that cannot be listed — including a `.reports` directory — is skipped
 * rather than failing the whole walk.
 *
 * @param dir Directory to start from.
 * @param depth Current recursion depth (callers normally omit it).
 * @returns Paths of the checkpoint logs found, in directory-listing order.
 */
export async function findCheckpointFiles(dir, depth = 0) {
    if (depth > 6)
        return [];
    const SKIP = new Set(["node_modules", "dist", ".git", ".pnpm"]);
    let entries;
    try {
        entries = await readdir(dir, { withFileTypes: true });
    }
    catch {
        return [];
    }
    const out = [];
    for (const entry of entries) {
        if (SKIP.has(entry.name) || !entry.isDirectory())
            continue;
        const full = join(dir, entry.name);
        if (entry.name === ".reports") {
            let files;
            try {
                files = await readdir(full);
            }
            catch {
                continue; // unreadable .reports: skip it, keep walking siblings
            }
            if (files.includes("checkpoints.jsonl")) {
                out.push(join(full, "checkpoints.jsonl"));
            }
        }
        else if (!entry.name.startsWith(".")) {
            out.push(...(await findCheckpointFiles(full, depth + 1)));
        }
    }
    return out;
}
|
|
540
|
+
/**
|
|
541
|
+
* Read every checkpoint record from the log(s) under `cwd`. Malformed lines
|
|
542
|
+
* (e.g. a crash mid-append) are skipped defensively rather than failing the
|
|
543
|
+
* whole read.
|
|
544
|
+
*/
|
|
545
|
+
export async function readCheckpoints(cwd) {
|
|
546
|
+
const files = await findCheckpointFiles(cwd);
|
|
547
|
+
const all = [];
|
|
548
|
+
for (const f of files) {
|
|
549
|
+
let content;
|
|
550
|
+
try {
|
|
551
|
+
content = await readFile(f, "utf-8");
|
|
552
|
+
}
|
|
553
|
+
catch {
|
|
554
|
+
continue;
|
|
555
|
+
}
|
|
556
|
+
for (const line of content.split("\n")) {
|
|
557
|
+
const trimmed = line.trim();
|
|
558
|
+
if (!trimmed)
|
|
559
|
+
continue;
|
|
560
|
+
try {
|
|
561
|
+
all.push(JSON.parse(trimmed));
|
|
562
|
+
}
|
|
563
|
+
catch {
|
|
564
|
+
/* skip malformed line */
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
return all;
|
|
569
|
+
}
|
|
570
|
+
/**
|
|
571
|
+
* Adapt a checkpoint record to the ParsedReport shape the stats/preview
|
|
572
|
+
* renderers consume. When the record references a `--record` snapshot, the
|
|
573
|
+
* per-scene detail (scenes / suites / failed cases) is loaded lazily from it;
|
|
574
|
+
* otherwise the lightweight (record-only) view is returned.
|
|
575
|
+
*/
|
|
576
|
+
export async function checkpointToReport(rec) {
|
|
577
|
+
let scenes;
|
|
578
|
+
let suites;
|
|
579
|
+
let failedCases = [];
|
|
580
|
+
if (rec.recordPath) {
|
|
581
|
+
try {
|
|
582
|
+
const content = await readFile(join(process.cwd(), rec.recordPath), "utf-8");
|
|
583
|
+
const snap = parseReport(content, rec.recordPath);
|
|
584
|
+
scenes = snap.scenes;
|
|
585
|
+
suites = snap.suites;
|
|
586
|
+
failedCases = snap.failedCases;
|
|
587
|
+
}
|
|
588
|
+
catch {
|
|
589
|
+
/* snapshot missing — fall back to the lightweight view */
|
|
590
|
+
}
|
|
591
|
+
}
|
|
592
|
+
return {
|
|
593
|
+
name: rec.agentName,
|
|
594
|
+
systemPromptHash: rec.systemPromptHash,
|
|
595
|
+
promptHash: rec.dimensions?.prompt,
|
|
596
|
+
dimensions: rec.dimensions,
|
|
597
|
+
tools: rec.tools,
|
|
598
|
+
model: rec.dimensions?.model ?? rec.model ?? "unknown",
|
|
599
|
+
successRate: rec.successRate,
|
|
600
|
+
totalCases: rec.totalCases,
|
|
601
|
+
failedCasesCount: Math.max(0, rec.totalCases - rec.casesPassed),
|
|
602
|
+
failedCases,
|
|
603
|
+
duration: rec.durationMs,
|
|
604
|
+
timestamp: rec.timestamp,
|
|
605
|
+
averageInputTokensPerCase: rec.avgInputTokensPerCase,
|
|
606
|
+
averageOutputTokensPerCase: rec.avgOutputTokensPerCase,
|
|
607
|
+
totalInputTokens: rec.totalInputTokens,
|
|
608
|
+
totalOutputTokens: rec.totalOutputTokens,
|
|
609
|
+
totalCostUsd: rec.costUsd ?? undefined,
|
|
610
|
+
scenes,
|
|
611
|
+
suites,
|
|
612
|
+
source: rec.recordPath ?? rec.runId,
|
|
613
|
+
};
|
|
614
|
+
}
|
|
615
|
+
/**
|
|
616
|
+
* Load all runs as ParsedReports: the canonical checkpoint log (primary) plus
|
|
617
|
+
* any legacy `report-*.yaml` still on disk (backward compat). Snapshots under
|
|
618
|
+
* `.reports/runs/` are NOT scanned directly — they are reached via a record's
|
|
619
|
+
* recordPath, so they never double-count.
|
|
620
|
+
*/
|
|
621
|
+
export async function loadReports(cwd) {
|
|
622
|
+
const records = await readCheckpoints(cwd);
|
|
623
|
+
const fromCheckpoints = await Promise.all(records.map((r) => checkpointToReport(r)));
|
|
624
|
+
const legacyFiles = await findReports(cwd);
|
|
625
|
+
const legacy = await Promise.all(legacyFiles.map(async (f) => {
|
|
626
|
+
const content = await readFile(f, "utf-8");
|
|
627
|
+
const r = parseReport(content, relative(cwd, f));
|
|
628
|
+
await ensureDimensions(r);
|
|
629
|
+
return r;
|
|
630
|
+
}));
|
|
631
|
+
return [...fromCheckpoints, ...legacy];
|
|
491
632
|
}
|
|
492
633
|
export function formatDuration(ms) {
|
|
493
634
|
if (ms < 1000)
|
package/dist/runner.js
CHANGED
|
@@ -2,6 +2,7 @@ import { collectPendingJudgements } from "./assertions";
|
|
|
2
2
|
import { callJudge, resolveJudgeExecutor } from "./judge";
|
|
3
3
|
import { resolveValue, resolveText, serializeValue, navigatePath } from "./resolve";
|
|
4
4
|
import { validateAgainstSchema } from "./schema";
|
|
5
|
+
import { wilsonInterval } from "./reports";
|
|
5
6
|
const DEFAULT_SCENE_TIMEOUT = 10_000;
|
|
6
7
|
/**
|
|
7
8
|
* Extract a named field from an agent response for assertion.
|
|
@@ -31,23 +32,6 @@ export function extractField(response, field) {
|
|
|
31
32
|
}
|
|
32
33
|
}
|
|
33
34
|
}
|
|
34
|
-
/**
|
|
35
|
-
* Compute Wilson score interval lower bound.
|
|
36
|
-
* Measures confidence that the true pass rate is above 50% (random chance).
|
|
37
|
-
* z = 1.96 for 95% confidence level.
|
|
38
|
-
*/
|
|
39
|
-
function wilsonSignificance(passes, total) {
|
|
40
|
-
if (total === 0)
|
|
41
|
-
return 0;
|
|
42
|
-
const z = 1.96;
|
|
43
|
-
const p = passes / total;
|
|
44
|
-
const denominator = 1 + (z * z) / total;
|
|
45
|
-
const centre = p + (z * z) / (2 * total);
|
|
46
|
-
const spread = z * Math.sqrt((p * (1 - p) + (z * z) / (4 * total)) / total);
|
|
47
|
-
const lower = (centre - spread) / denominator;
|
|
48
|
-
// Return the lower bound clamped to [0, 1]
|
|
49
|
-
return Math.max(0, Math.min(1, lower));
|
|
50
|
-
}
|
|
51
35
|
async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig) {
|
|
52
36
|
// The empty sentinel uses the `text` branch of the union so it is a valid
|
|
53
37
|
// AgentResponse<T> for ANY T (there is no native value yet — the executor
|
|
@@ -178,7 +162,7 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
178
162
|
const passes = runs.filter((r) => r.passed).length;
|
|
179
163
|
const passRate = passes / runs.length;
|
|
180
164
|
const totalDuration = runs.reduce((sum, r) => sum + r.duration, 0);
|
|
181
|
-
const statisticalSignificance =
|
|
165
|
+
const statisticalSignificance = wilsonInterval(passes, runs.length).low;
|
|
182
166
|
// Use the last run's response as representative
|
|
183
167
|
const lastRun = runs[runs.length - 1];
|
|
184
168
|
// Overall pass = majority passed (> 50%)
|
package/dist/stats.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { readdir,
|
|
1
|
+
import { readdir, writeFile, rm } from "fs/promises";
|
|
2
2
|
import { join, relative, resolve } from "path";
|
|
3
|
-
import {
|
|
3
|
+
import { loadReports, readCheckpoints, loadDiffEntry, computeDiff, formatDuration, findVaryingDimensions, groupByDimension, findControlledPairs, diffConfigs, } from "./reports.js";
|
|
4
4
|
function avg(nums) {
|
|
5
5
|
return nums.length === 0
|
|
6
6
|
? undefined
|
|
@@ -193,28 +193,71 @@ async function purge(cwd) {
|
|
|
193
193
|
// ---------------------------------------------------------------------------
|
|
194
194
|
// Main
|
|
195
195
|
// ---------------------------------------------------------------------------
|
|
196
|
+
/**
 * Render one value as a CSV cell per RFC 4180.
 *
 * `null`/`undefined` become the empty string; everything else is stringified.
 * The cell is wrapped in double quotes, with embedded quotes doubled, when it
 * contains a comma, a double quote, or a line break. A bare carriage return
 * counts as a line break too, since spreadsheet tools treat it as one.
 *
 * @param v Value to render.
 * @returns The cell text, quoted only when required.
 */
function csvCell(v) {
    const s = v == null ? "" : String(v);
    return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
}
|
|
201
|
+
/**
|
|
202
|
+
* Flatten the JSONL run log to a CSV projection on demand — the only place CSV
|
|
203
|
+
* escaping lives. For spreadsheet/eyeball use; the JSONL log stays canonical.
|
|
204
|
+
*/
|
|
205
|
+
async function exportCsv(cwd, outPath) {
|
|
206
|
+
const records = await readCheckpoints(cwd);
|
|
207
|
+
if (records.length === 0) {
|
|
208
|
+
console.log("\n No checkpoints to export. Run some agent tests first.\n");
|
|
209
|
+
return;
|
|
210
|
+
}
|
|
211
|
+
const header = [
|
|
212
|
+
"runId", "sweepId", "timestamp", "agentName", "suiteHash", "model", "promptHash",
|
|
213
|
+
"tools", "judge", "runs", "runsPerScene", "totalCases", "casesPassed", "successRate",
|
|
214
|
+
"wilsonLow", "wilsonHigh", "durationMs", "costUsd", "totalInputTokens",
|
|
215
|
+
"totalOutputTokens", "recordPath",
|
|
216
|
+
];
|
|
217
|
+
const rows = records.map((r) => [
|
|
218
|
+
r.runId, r.sweepId, r.timestamp, r.agentName,
|
|
219
|
+
r.dimensions?.suiteHash, r.dimensions?.model, r.dimensions?.prompt,
|
|
220
|
+
Array.isArray(r.tools) ? r.tools.join("|") : r.dimensions?.tools,
|
|
221
|
+
r.dimensions?.judge, r.dimensions?.runs, r.runsPerScene, r.totalCases,
|
|
222
|
+
r.casesPassed, r.successRate, r.wilsonLow, r.wilsonHigh, r.durationMs,
|
|
223
|
+
r.costUsd, r.totalInputTokens, r.totalOutputTokens, r.recordPath,
|
|
224
|
+
].map(csvCell).join(","));
|
|
225
|
+
const csv = [header.join(","), ...rows].join("\n") + "\n";
|
|
226
|
+
await writeFile(join(cwd, outPath), csv, "utf-8");
|
|
227
|
+
console.log(`\n Exported ${records.length} checkpoint${records.length !== 1 ? "s" : ""} to ${outPath}\n`);
|
|
228
|
+
}
|
|
196
229
|
async function main() {
|
|
197
230
|
const args = process.argv.slice(2);
|
|
198
231
|
const agentFlagIdx = args.indexOf("--agent");
|
|
199
232
|
const agentFilter = agentFlagIdx !== -1 ? args[agentFlagIdx + 1] : undefined;
|
|
200
233
|
const modelFlagIdx = args.indexOf("--model");
|
|
201
234
|
const modelFilter = modelFlagIdx !== -1 ? args[modelFlagIdx + 1] : undefined;
|
|
235
|
+
const suiteFlagIdx = args.indexOf("--suite");
|
|
236
|
+
const suiteFilter = suiteFlagIdx !== -1 ? args[suiteFlagIdx + 1] : undefined;
|
|
202
237
|
if (args.includes("--purge")) {
|
|
203
238
|
await purge(process.cwd());
|
|
204
239
|
return;
|
|
205
240
|
}
|
|
206
241
|
const cwd = process.cwd();
|
|
207
|
-
const
|
|
208
|
-
if (
|
|
242
|
+
const csvFlagIdx = args.indexOf("--export-csv");
|
|
243
|
+
if (csvFlagIdx !== -1) {
|
|
244
|
+
const next = args[csvFlagIdx + 1];
|
|
245
|
+
const outPath = next && !next.startsWith("--") ? next : "agest-checkpoints.csv";
|
|
246
|
+
await exportCsv(cwd, outPath);
|
|
247
|
+
return;
|
|
248
|
+
}
|
|
249
|
+
let reports = await loadReports(cwd);
|
|
250
|
+
if (reports.length === 0) {
|
|
209
251
|
console.log("\n No reports found. Run some agent tests first.\n");
|
|
210
252
|
return;
|
|
211
253
|
}
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
254
|
+
if (suiteFilter) {
|
|
255
|
+
reports = reports.filter((r) => r.dimensions?.suiteHash === suiteFilter);
|
|
256
|
+
if (reports.length === 0) {
|
|
257
|
+
console.log(`\n No reports found for suite "${suiteFilter}".\n`);
|
|
258
|
+
return;
|
|
259
|
+
}
|
|
260
|
+
}
|
|
218
261
|
if (agentFilter) {
|
|
219
262
|
reports = reports.filter((r) => r.name?.toLowerCase() === agentFilter.toLowerCase());
|
|
220
263
|
if (reports.length === 0) {
|
package/dist/types.d.ts
CHANGED
|
@@ -135,6 +135,13 @@ export interface AgentReport<T = string> {
|
|
|
135
135
|
timestamp: string;
|
|
136
136
|
duration: number;
|
|
137
137
|
totalCases: number;
|
|
138
|
+
/** Cases that passed (totalCases - failures). Persisted for statistical honesty. */
|
|
139
|
+
casesPassed?: number;
|
|
140
|
+
/** Configured runs per scene (sampling count) — affects the trial basis below. */
|
|
141
|
+
runsPerScene?: number;
|
|
142
|
+
/** Wilson score interval (95%) across all trials = Σ scene runs. */
|
|
143
|
+
wilsonLow?: number;
|
|
144
|
+
wilsonHigh?: number;
|
|
138
145
|
averageInputTokensPerCase?: number;
|
|
139
146
|
averageOutputTokensPerCase?: number;
|
|
140
147
|
totalInputTokens?: number;
|
|
@@ -142,4 +149,34 @@ export interface AgentReport<T = string> {
|
|
|
142
149
|
totalCostUsd?: number;
|
|
143
150
|
results: SceneResult<T>[];
|
|
144
151
|
}
|
|
152
|
+
/**
 * A single line of `.reports/checkpoints.jsonl`, the append-only canonical
 * run log.
 *
 * Deliberately lightweight (identity, cost and summary statistics) and written
 * on every run. `dimensions` and `tools` are stored as native structured
 * values, so new fields can be added later without a migration.
 */
export interface CheckpointRecord {
    // --- identity -----------------------------------------------------------
    /** Identifier of this run. */
    runId: string;
    /** Identifier shared with other runs; presumably groups one sweep (confirm against the writer). */
    sweepId?: string;
    /** When the run was recorded. */
    timestamp: string;
    /** Name of the agent under test, if one was given. */
    agentName?: string;
    /** Model name; `dimensions.model` carries the same notion inside the identity map. */
    model?: string;
    /** Hash of the system prompt. */
    systemPromptHash?: string;
    /** Names of the tools available to the agent. */
    tools?: string[];
    /** Config identity map: { suiteHash, model, prompt, tools, judge, runs }. */
    dimensions: Record<string, string>;
    // --- outcome ------------------------------------------------------------
    /** Configured runs per scene (sampling count). */
    runsPerScene?: number;
    /** Number of cases in the run. */
    totalCases: number;
    /** Number of cases that passed. */
    casesPassed: number;
    /** Fraction of cases that passed. */
    successRate: number;
    /** Lower bound of the run's Wilson score interval. */
    wilsonLow?: number;
    /** Upper bound of the run's Wilson score interval. */
    wilsonHigh?: number;
    // --- cost ---------------------------------------------------------------
    /** Run duration in milliseconds. */
    durationMs: number;
    /** Cost in USD; may be `null` or absent. */
    costUsd?: number | null;
    /** Input tokens summed over the run. */
    totalInputTokens?: number;
    /** Output tokens summed over the run. */
    totalOutputTokens?: number;
    /** Average input tokens per case. */
    avgInputTokensPerCase?: number;
    /** Average output tokens per case. */
    avgOutputTokensPerCase?: number;
    // --- detail -------------------------------------------------------------
    /** Relative path to the full YAML snapshot, set only when run with --record. */
    recordPath?: string;
}
|
|
145
182
|
export {};
|