@sebastiantuyu/agest 0.1.6 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/adapters/index.d.ts +2 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/remote.d.ts +58 -0
- package/dist/adapters/remote.js +127 -0
- package/dist/assertions.d.ts +7 -0
- package/dist/assertions.js +9 -0
- package/dist/config.d.ts +19 -0
- package/dist/config.js +19 -0
- package/dist/context.d.ts +7 -1
- package/dist/context.js +60 -18
- package/dist/index.d.ts +8 -2
- package/dist/index.js +3 -2
- package/dist/judge.d.ts +9 -0
- package/dist/judge.js +101 -0
- package/dist/preview.d.ts +1 -0
- package/dist/preview.js +777 -0
- package/dist/reporter.d.ts +2 -1
- package/dist/reporter.js +49 -14
- package/dist/reports.d.ts +78 -0
- package/dist/reports.js +278 -0
- package/dist/runner.d.ts +2 -1
- package/dist/runner.js +46 -4
- package/dist/stats.js +222 -65
- package/dist/types.d.ts +12 -0
- package/package.json +12 -3
package/dist/stats.js
CHANGED
|
@@ -1,59 +1,6 @@
|
|
|
1
|
-
import { readdir, readFile } from "fs/promises";
|
|
1
|
+
import { readdir, readFile, rm } from "fs/promises";
|
|
2
2
|
import { join, relative } from "path";
|
|
3
|
-
function extractField(content, key) {
|
|
4
|
-
const regex = new RegExp(`^ ${key}:\\s*(.+)$`, "m");
|
|
5
|
-
const match = content.match(regex);
|
|
6
|
-
if (!match)
|
|
7
|
-
return undefined;
|
|
8
|
-
return match[1].replace(/^"|"$/g, "").trim();
|
|
9
|
-
}
|
|
10
|
-
function parseReport(content, source) {
|
|
11
|
-
const num = (key, fallback = 0) => parseFloat(extractField(content, key) ?? String(fallback));
|
|
12
|
-
const avgIn = extractField(content, "average_input_tokens_per_case");
|
|
13
|
-
const avgOut = extractField(content, "average_output_tokens_per_case");
|
|
14
|
-
return {
|
|
15
|
-
model: extractField(content, "model") ?? "unknown",
|
|
16
|
-
successRate: num("success_rate"),
|
|
17
|
-
totalCases: num("total_cases"),
|
|
18
|
-
duration: num("duration"),
|
|
19
|
-
timestamp: extractField(content, "timestamp") ?? "",
|
|
20
|
-
averageInputTokensPerCase: avgIn != null ? parseFloat(avgIn) : undefined,
|
|
21
|
-
averageOutputTokensPerCase: avgOut != null ? parseFloat(avgOut) : undefined,
|
|
22
|
-
source,
|
|
23
|
-
};
|
|
24
|
-
}
|
|
25
|
-
async function findReports(dir, depth = 0) {
|
|
26
|
-
if (depth > 6)
|
|
27
|
-
return [];
|
|
28
|
-
const SKIP = new Set(["node_modules", "dist", ".git", ".pnpm"]);
|
|
29
|
-
const results = [];
|
|
30
|
-
let entries;
|
|
31
|
-
try {
|
|
32
|
-
entries = await readdir(dir, { withFileTypes: true });
|
|
33
|
-
}
|
|
34
|
-
catch {
|
|
35
|
-
return [];
|
|
36
|
-
}
|
|
37
|
-
for (const entry of entries) {
|
|
38
|
-
if (entry.name.startsWith(".") || SKIP.has(entry.name))
|
|
39
|
-
continue;
|
|
40
|
-
const fullPath = join(dir, entry.name);
|
|
41
|
-
if (entry.isDirectory()) {
|
|
42
|
-
if (entry.name === "reports") {
|
|
43
|
-
const files = await readdir(fullPath);
|
|
44
|
-
for (const f of files) {
|
|
45
|
-
if (f.endsWith(".yaml") || f.endsWith(".yml")) {
|
|
46
|
-
results.push(join(fullPath, f));
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
else {
|
|
51
|
-
results.push(...(await findReports(fullPath, depth + 1)));
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
return results;
|
|
56
|
-
}
|
|
3
|
+
import { parseReport, findReports, loadDiffEntry, computeDiff, formatDuration, ensureDimensions, findVaryingDimensions, groupByDimension, findControlledPairs, diffConfigs, } from "./reports.js";
|
|
57
4
|
function avg(nums) {
|
|
58
5
|
return nums.length === 0
|
|
59
6
|
? undefined
|
|
@@ -75,28 +22,207 @@ function printSection(title, rows, max) {
|
|
|
75
22
|
console.log(` ${label} ${b} ${row.display}`);
|
|
76
23
|
}
|
|
77
24
|
}
|
|
78
|
-
function
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
25
|
+
function formatDelta(prev, curr) {
|
|
26
|
+
const d = (curr - prev) * 100;
|
|
27
|
+
if (Math.abs(d) < 0.5)
|
|
28
|
+
return " = ";
|
|
29
|
+
const sign = d > 0 ? "+" : "";
|
|
30
|
+
return `${sign}${d.toFixed(0)}%`.padStart(5);
|
|
31
|
+
}
|
|
32
|
+
function shortDimLabel(dim, val, maxLen = 20) {
|
|
33
|
+
const short = val.length > maxLen ? val.slice(0, maxLen - 1) + "…" : val;
|
|
34
|
+
return `${dim}:${short}`;
|
|
35
|
+
}
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Per-dimension evolution: group by held dims, show evolution along varied
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
async function printDimensionEvolution(name, runs, primaryDim, varyingDims) {
|
|
40
|
+
const groups = groupByDimension(runs, primaryDim);
|
|
41
|
+
for (const [groupVal, groupRuns] of groups) {
|
|
42
|
+
const sorted = [...groupRuns].sort((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime());
|
|
43
|
+
console.log(`\n ${name} / ${primaryDim}: ${groupVal}`);
|
|
44
|
+
console.log(" " + "─".repeat(W - 2));
|
|
45
|
+
for (let i = 0; i < sorted.length; i++) {
|
|
46
|
+
const r = sorted[i];
|
|
47
|
+
const runNum = `#${i + 1}`.padStart(3);
|
|
48
|
+
const pct = `${(r.successRate * 100).toFixed(0).padStart(3)}%`;
|
|
49
|
+
const b = bar(r.successRate, 1, 16);
|
|
50
|
+
const delta = i === 0 ? " " : formatDelta(sorted[i - 1].successRate, r.successRate);
|
|
51
|
+
// Show other varying dimensions for this run
|
|
52
|
+
const otherDims = varyingDims
|
|
53
|
+
.filter((d) => d !== primaryDim)
|
|
54
|
+
.map((d) => shortDimLabel(d, r.dimensions?.[d] ?? "?", 12))
|
|
55
|
+
.join(" ");
|
|
56
|
+
console.log(` ${runNum} ${b} ${pct} ${delta} ${otherDims}`);
|
|
57
|
+
// Show what changed from previous run (within this group)
|
|
58
|
+
if (i > 0) {
|
|
59
|
+
const prev = sorted[i - 1];
|
|
60
|
+
const diff = diffConfigs(prev.dimensions ?? {}, r.dimensions ?? {});
|
|
61
|
+
const changedLabels = Object.entries(diff.varied)
|
|
62
|
+
.filter(([k]) => k !== primaryDim)
|
|
63
|
+
.map(([k, v]) => `${k}: ${v.from} → ${v.to}`)
|
|
64
|
+
.slice(0, 3);
|
|
65
|
+
for (const l of changedLabels) {
|
|
66
|
+
console.log(` ${l}`);
|
|
67
|
+
}
|
|
68
|
+
// Show prompt diff if prompt changed
|
|
69
|
+
if (diff.varied["prompt"] && prev.systemPromptHash && r.systemPromptHash) {
|
|
70
|
+
const [prevEntry, currEntry] = await Promise.all([
|
|
71
|
+
loadDiffEntry(prev.systemPromptHash),
|
|
72
|
+
loadDiffEntry(r.systemPromptHash),
|
|
73
|
+
]);
|
|
74
|
+
if (prevEntry && currEntry) {
|
|
75
|
+
const promptDiff = computeDiff(prevEntry, currEntry)
|
|
76
|
+
.filter((l) => l.startsWith("prompt:") || l.startsWith("tools:"));
|
|
77
|
+
for (const l of promptDiff)
|
|
78
|
+
console.log(` ${l}`);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
// Cross-dimension comparison
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
function printCrossComparison(name, runs, dim) {
|
|
89
|
+
// Find configs that appear across multiple values of `dim`
|
|
90
|
+
const otherDims = Object.keys(runs[0]?.dimensions ?? {}).filter((d) => d !== dim);
|
|
91
|
+
const configKey = (r) => otherDims.map((d) => `${d}:${r.dimensions?.[d] ?? "?"}`).join("|");
|
|
92
|
+
const byConfig = new Map();
|
|
93
|
+
for (const r of runs) {
|
|
94
|
+
const ck = configKey(r);
|
|
95
|
+
const inner = byConfig.get(ck) ?? new Map();
|
|
96
|
+
const dimVal = r.dimensions?.[dim] ?? "?";
|
|
97
|
+
// Keep the latest run per (config, dimValue) pair
|
|
98
|
+
const existing = inner.get(dimVal);
|
|
99
|
+
if (!existing || new Date(r.timestamp) > new Date(existing.timestamp)) {
|
|
100
|
+
inner.set(dimVal, r);
|
|
101
|
+
}
|
|
102
|
+
byConfig.set(ck, inner);
|
|
103
|
+
}
|
|
104
|
+
// Only show configs tested across 2+ values of the dimension
|
|
105
|
+
const comparable = [...byConfig.entries()].filter(([, m]) => m.size > 1);
|
|
106
|
+
if (comparable.length === 0)
|
|
107
|
+
return;
|
|
108
|
+
console.log(`\n Cross-${dim} comparison: ${name}`);
|
|
109
|
+
console.log(" " + "─".repeat(W - 2));
|
|
110
|
+
for (const [ck, dimMap] of comparable) {
|
|
111
|
+
const configLabel = ck
|
|
112
|
+
.split("|")
|
|
113
|
+
.map((s) => s.length > 30 ? s.slice(0, 29) + "…" : s)
|
|
114
|
+
.join(" + ");
|
|
115
|
+
console.log(` ${configLabel}`);
|
|
116
|
+
const entries = [...dimMap.entries()].sort((a, b) => b[1].successRate - a[1].successRate);
|
|
117
|
+
const best = entries[0][1].successRate;
|
|
118
|
+
for (const [dimVal, r] of entries) {
|
|
119
|
+
const pct = `${(r.successRate * 100).toFixed(0)}%`.padStart(4);
|
|
120
|
+
const label = dimVal.length > 28 ? dimVal.slice(0, 27) + "…" : dimVal;
|
|
121
|
+
const vs = r.successRate === best ? "" : ` (${((r.successRate - best) * 100).toFixed(0)}%)`;
|
|
122
|
+
console.log(` ${label.padEnd(30)} ${pct}${vs}`);
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
// ---------------------------------------------------------------------------
|
|
127
|
+
// Attribution summary
|
|
128
|
+
// ---------------------------------------------------------------------------
|
|
129
|
+
function printAttribution(name, runs) {
|
|
130
|
+
const pairs = findControlledPairs(runs);
|
|
131
|
+
if (pairs.length === 0)
|
|
132
|
+
return;
|
|
133
|
+
// Group by dimension
|
|
134
|
+
const byDim = new Map();
|
|
135
|
+
for (const p of pairs) {
|
|
136
|
+
const entry = byDim.get(p.variedDimension) ?? { deltas: [], examples: [] };
|
|
137
|
+
entry.deltas.push(p.delta);
|
|
138
|
+
if (entry.examples.length < 2) {
|
|
139
|
+
const d = (p.delta * 100).toFixed(0);
|
|
140
|
+
const sign = p.delta > 0 ? "+" : "";
|
|
141
|
+
entry.examples.push(`${p.variedFrom} → ${p.variedTo}: ${sign}${d}%`);
|
|
142
|
+
}
|
|
143
|
+
byDim.set(p.variedDimension, entry);
|
|
144
|
+
}
|
|
145
|
+
console.log(`\n Dimension Impact: ${name}`);
|
|
146
|
+
console.log(" " + "─".repeat(W - 2));
|
|
147
|
+
const sorted = [...byDim.entries()].sort((a, b) => Math.max(...b[1].deltas.map(Math.abs)) - Math.max(...a[1].deltas.map(Math.abs)));
|
|
148
|
+
for (const [dim, { deltas, examples }] of sorted) {
|
|
149
|
+
const avgDelta = deltas.reduce((a, b) => a + b, 0) / deltas.length;
|
|
150
|
+
const sign = avgDelta > 0 ? "+" : "";
|
|
151
|
+
const avgStr = `${sign}${(avgDelta * 100).toFixed(0)}%`;
|
|
152
|
+
console.log(` ${dim.padEnd(12)} ${avgStr.padStart(6)} avg (${deltas.length} comparison${deltas.length !== 1 ? "s" : ""})`);
|
|
153
|
+
for (const ex of examples) {
|
|
154
|
+
console.log(` ${ex}`);
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
// ---------------------------------------------------------------------------
|
|
159
|
+
// Purge
|
|
160
|
+
// ---------------------------------------------------------------------------
|
|
161
|
+
async function purge(cwd) {
|
|
162
|
+
const SKIP = new Set(["node_modules", "dist", ".git", ".pnpm"]);
|
|
163
|
+
let count = 0;
|
|
164
|
+
async function walk(dir, depth = 0) {
|
|
165
|
+
if (depth > 6)
|
|
166
|
+
return;
|
|
167
|
+
let entries;
|
|
168
|
+
try {
|
|
169
|
+
entries = await readdir(dir, { withFileTypes: true });
|
|
170
|
+
}
|
|
171
|
+
catch {
|
|
172
|
+
return;
|
|
173
|
+
}
|
|
174
|
+
for (const entry of entries) {
|
|
175
|
+
if (SKIP.has(entry.name))
|
|
176
|
+
continue;
|
|
177
|
+
const fullPath = join(dir, entry.name);
|
|
178
|
+
if (entry.isDirectory()) {
|
|
179
|
+
if (entry.name === ".reports" || entry.name === ".diff") {
|
|
180
|
+
await rm(fullPath, { recursive: true, force: true });
|
|
181
|
+
console.log(` removed ${relative(cwd, fullPath)}/`);
|
|
182
|
+
count++;
|
|
183
|
+
}
|
|
184
|
+
else if (!entry.name.startsWith(".")) {
|
|
185
|
+
await walk(fullPath, depth + 1);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
await walk(cwd);
|
|
191
|
+
console.log(`\n Purged ${count} director${count !== 1 ? "ies" : "y"}.\n`);
|
|
86
192
|
}
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
// Main
|
|
195
|
+
// ---------------------------------------------------------------------------
|
|
87
196
|
async function main() {
|
|
197
|
+
const args = process.argv.slice(2);
|
|
198
|
+
const agentFlagIdx = args.indexOf("--agent");
|
|
199
|
+
const agentFilter = agentFlagIdx !== -1 ? args[agentFlagIdx + 1] : undefined;
|
|
200
|
+
if (args.includes("--purge")) {
|
|
201
|
+
await purge(process.cwd());
|
|
202
|
+
return;
|
|
203
|
+
}
|
|
88
204
|
const cwd = process.cwd();
|
|
89
205
|
const files = await findReports(cwd);
|
|
90
206
|
if (files.length === 0) {
|
|
91
207
|
console.log("\n No reports found. Run some agent tests first.\n");
|
|
92
208
|
return;
|
|
93
209
|
}
|
|
94
|
-
|
|
210
|
+
let reports = await Promise.all(files.map(async (f) => {
|
|
95
211
|
const content = await readFile(f, "utf-8");
|
|
96
212
|
return parseReport(content, relative(cwd, f));
|
|
97
213
|
}));
|
|
214
|
+
// Ensure all reports have dimensions (backward compat)
|
|
215
|
+
await Promise.all(reports.map((r) => ensureDimensions(r)));
|
|
216
|
+
if (agentFilter) {
|
|
217
|
+
reports = reports.filter((r) => r.name?.toLowerCase() === agentFilter.toLowerCase());
|
|
218
|
+
if (reports.length === 0) {
|
|
219
|
+
console.log(`\n No reports found for agent "${agentFilter}".\n`);
|
|
220
|
+
return;
|
|
221
|
+
}
|
|
222
|
+
}
|
|
98
223
|
console.log("\n" + "━".repeat(W));
|
|
99
|
-
|
|
224
|
+
const filterLabel = agentFilter ? ` · agent: ${agentFilter}` : "";
|
|
225
|
+
console.log(` AGEST STATS · ${reports.length} report${reports.length !== 1 ? "s" : ""} found${filterLabel}`);
|
|
100
226
|
console.log("━".repeat(W));
|
|
101
227
|
// Aggregate by model
|
|
102
228
|
const byModel = new Map();
|
|
@@ -148,6 +274,37 @@ async function main() {
|
|
|
148
274
|
value: a.avgDuration,
|
|
149
275
|
display: formatDuration(a.avgDuration).padStart(8),
|
|
150
276
|
})), maxDuration);
|
|
277
|
+
// Dimension-aware evolution — named agents with more than one run
|
|
278
|
+
const named = reports.filter((r) => r.name);
|
|
279
|
+
const byAgentName = new Map();
|
|
280
|
+
for (const r of named) {
|
|
281
|
+
const arr = byAgentName.get(r.name) ?? [];
|
|
282
|
+
arr.push(r);
|
|
283
|
+
byAgentName.set(r.name, arr);
|
|
284
|
+
}
|
|
285
|
+
const evolvingAgents = [...byAgentName.entries()].filter(([, runs]) => runs.length > 1);
|
|
286
|
+
if (evolvingAgents.length > 0) {
|
|
287
|
+
console.log(`\n ${"─".repeat(W - 2)}`);
|
|
288
|
+
console.log(` EVOLUTION · dimension-aware grouping`);
|
|
289
|
+
for (const [name, runs] of evolvingAgents) {
|
|
290
|
+
const varyingDims = findVaryingDimensions(runs);
|
|
291
|
+
if (varyingDims.length === 0) {
|
|
292
|
+
// All runs have identical config — just show flat timeline
|
|
293
|
+
await printDimensionEvolution(name, runs, "model", []);
|
|
294
|
+
}
|
|
295
|
+
else {
|
|
296
|
+
// Group by the primary varying dimension (most unique values)
|
|
297
|
+
const primaryDim = varyingDims[0];
|
|
298
|
+
await printDimensionEvolution(name, runs, primaryDim, varyingDims);
|
|
299
|
+
// Cross-comparison for the primary varying dimension
|
|
300
|
+
if (varyingDims.length >= 2) {
|
|
301
|
+
printCrossComparison(name, runs, primaryDim);
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
// Attribution summary from controlled pairs
|
|
305
|
+
printAttribution(name, runs);
|
|
306
|
+
}
|
|
307
|
+
}
|
|
151
308
|
console.log("\n" +
|
|
152
309
|
"━".repeat(W) +
|
|
153
310
|
`\n ${agg.length} model${agg.length !== 1 ? "s" : ""} · ${reports.length} total runs\n` +
|
package/dist/types.d.ts
CHANGED
|
@@ -20,6 +20,14 @@ export interface SceneDefinition {
|
|
|
20
20
|
field: string;
|
|
21
21
|
fn: (value: any) => void;
|
|
22
22
|
}>;
|
|
23
|
+
timeout?: number;
|
|
24
|
+
turns?: number;
|
|
25
|
+
}
|
|
26
|
+
export type JudgeVerdict = "pass" | "fail" | "partial";
|
|
27
|
+
export interface JudgeResult {
|
|
28
|
+
verdict: JudgeVerdict;
|
|
29
|
+
reasoning: string;
|
|
30
|
+
criteria: string;
|
|
23
31
|
}
|
|
24
32
|
export interface SceneResult {
|
|
25
33
|
prompt: string;
|
|
@@ -27,10 +35,14 @@ export interface SceneResult {
|
|
|
27
35
|
duration: number;
|
|
28
36
|
passed: boolean;
|
|
29
37
|
error?: string;
|
|
38
|
+
judgement?: JudgeResult;
|
|
30
39
|
}
|
|
31
40
|
export interface AgentReport {
|
|
41
|
+
name?: string;
|
|
32
42
|
model?: string;
|
|
33
43
|
systemPromptHash?: string;
|
|
44
|
+
promptHash?: string;
|
|
45
|
+
dimensions?: Record<string, string>;
|
|
34
46
|
tools?: string[];
|
|
35
47
|
successRate: number;
|
|
36
48
|
failedCases: string[];
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sebastiantuyu/agest",
|
|
3
|
-
"version": "0.1.6",
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"description": "A testing library for agents",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -23,11 +23,15 @@
|
|
|
23
23
|
}
|
|
24
24
|
},
|
|
25
25
|
"scripts": {
|
|
26
|
-
"build": "tsc",
|
|
27
|
-
"test": "
|
|
26
|
+
"build": "tsc -p tsconfig.build.json",
|
|
27
|
+
"test": "vitest run",
|
|
28
|
+
"test:watch": "vitest",
|
|
29
|
+
"test:coverage": "vitest run --coverage",
|
|
28
30
|
"dev": "tsx examples/basic.test.ts",
|
|
29
31
|
"test:examples": "tsx examples/basic.test.ts && tsx examples/agent.test.ts",
|
|
30
32
|
"stats": "tsx src/stats.ts",
|
|
33
|
+
"preview": "tsx src/preview.ts",
|
|
34
|
+
"site:preview": "npx serve site -p 3000",
|
|
31
35
|
"release:patch": "npm version patch && git push && git push --tags",
|
|
32
36
|
"release:minor": "npm version minor && git push && git push --tags",
|
|
33
37
|
"release:major": "npm version major && git push && git push --tags"
|
|
@@ -40,10 +44,15 @@
|
|
|
40
44
|
"@langchain/langgraph": "^1.2.8",
|
|
41
45
|
"@langchain/openai": "^1.4.4",
|
|
42
46
|
"@types/node": "^22.0.0",
|
|
47
|
+
"@vitest/coverage-v8": "^3",
|
|
43
48
|
"dotenv": "^17.4.1",
|
|
44
49
|
"langchain": "^1.3.1",
|
|
45
50
|
"tsx": "^4.21.0",
|
|
46
51
|
"typescript": "^5.4.0",
|
|
52
|
+
"vitest": "^3",
|
|
47
53
|
"zod": "^4.3.6"
|
|
54
|
+
},
|
|
55
|
+
"dependencies": {
|
|
56
|
+
"@supercharge/promise-pool": "^3.3.0"
|
|
48
57
|
}
|
|
49
58
|
}
|