@sebastiantuyu/agest 0.1.6 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/stats.js CHANGED
@@ -1,59 +1,6 @@
1
- import { readdir, readFile } from "fs/promises";
1
+ import { readdir, readFile, rm } from "fs/promises";
2
2
  import { join, relative } from "path";
3
- function extractField(content, key) {
4
- const regex = new RegExp(`^ ${key}:\\s*(.+)$`, "m");
5
- const match = content.match(regex);
6
- if (!match)
7
- return undefined;
8
- return match[1].replace(/^"|"$/g, "").trim();
9
- }
10
- function parseReport(content, source) {
11
- const num = (key, fallback = 0) => parseFloat(extractField(content, key) ?? String(fallback));
12
- const avgIn = extractField(content, "average_input_tokens_per_case");
13
- const avgOut = extractField(content, "average_output_tokens_per_case");
14
- return {
15
- model: extractField(content, "model") ?? "unknown",
16
- successRate: num("success_rate"),
17
- totalCases: num("total_cases"),
18
- duration: num("duration"),
19
- timestamp: extractField(content, "timestamp") ?? "",
20
- averageInputTokensPerCase: avgIn != null ? parseFloat(avgIn) : undefined,
21
- averageOutputTokensPerCase: avgOut != null ? parseFloat(avgOut) : undefined,
22
- source,
23
- };
24
- }
25
- async function findReports(dir, depth = 0) {
26
- if (depth > 6)
27
- return [];
28
- const SKIP = new Set(["node_modules", "dist", ".git", ".pnpm"]);
29
- const results = [];
30
- let entries;
31
- try {
32
- entries = await readdir(dir, { withFileTypes: true });
33
- }
34
- catch {
35
- return [];
36
- }
37
- for (const entry of entries) {
38
- if (entry.name.startsWith(".") || SKIP.has(entry.name))
39
- continue;
40
- const fullPath = join(dir, entry.name);
41
- if (entry.isDirectory()) {
42
- if (entry.name === "reports") {
43
- const files = await readdir(fullPath);
44
- for (const f of files) {
45
- if (f.endsWith(".yaml") || f.endsWith(".yml")) {
46
- results.push(join(fullPath, f));
47
- }
48
- }
49
- }
50
- else {
51
- results.push(...(await findReports(fullPath, depth + 1)));
52
- }
53
- }
54
- }
55
- return results;
56
- }
3
+ import { parseReport, findReports, loadDiffEntry, computeDiff, formatDuration, ensureDimensions, findVaryingDimensions, groupByDimension, findControlledPairs, diffConfigs, } from "./reports.js";
57
4
  function avg(nums) {
58
5
  return nums.length === 0
59
6
  ? undefined
@@ -75,28 +22,207 @@ function printSection(title, rows, max) {
75
22
  console.log(` ${label} ${b} ${row.display}`);
76
23
  }
77
24
  }
78
- function formatDuration(ms) {
79
- if (ms < 1000)
80
- return `${ms.toFixed(0)}ms`;
81
- if (ms < 60_000)
82
- return `${(ms / 1000).toFixed(1)}s`;
83
- const m = Math.floor(ms / 60_000);
84
- const s = ((ms % 60_000) / 1000).toFixed(0).padStart(2, "0");
85
- return `${m}m${s}s`;
25
/**
 * Formats the change between two success rates as a signed whole-number
 * percentage, always exactly five columns wide so rows stay aligned.
 *
 * @param {number} prev Success rate of the earlier run, as a fraction.
 * @param {number} curr Success rate of the later run, as a fraction.
 * @returns {string} `" = "` (padded) when the change is under half a point,
 *   `"?"` (padded) when either input is not a finite number, otherwise a
 *   string such as `" +25%"` or `" -10%"`.
 */
function formatDelta(prev, curr) {
    const WIDTH = 5;
    const d = (curr - prev) * 100;
    // A missing or unparseable rate would otherwise be rendered as "NaN%".
    if (!Number.isFinite(d))
        return "?".padStart(WIDTH);
    // Pad the "unchanged" marker too, so every branch has the same width.
    if (Math.abs(d) < 0.5)
        return " = ".padStart(WIDTH);
    const sign = d > 0 ? "+" : "";
    return `${sign}${d.toFixed(0)}%`.padStart(WIDTH);
}
32
/**
 * Builds a compact `dim:value` label, shortening the value with a trailing
 * ellipsis when it is longer than `maxLen` characters.
 *
 * Length is counted in Unicode code points so that truncation never cuts a
 * surrogate pair in half, and `maxLen` is clamped to at least 1 so that the
 * shortened value is never longer than requested.
 *
 * @param {string} dim Dimension name.
 * @param {string} val Dimension value.
 * @param {number} [maxLen=20] Maximum number of characters kept from `val`,
 *   including the ellipsis.
 * @returns {string} The label `${dim}:${value}`.
 */
function shortDimLabel(dim, val, maxLen = 20) {
    const chars = Array.from(val);
    const limit = Math.max(1, Math.floor(maxLen));
    const short = chars.length > limit
        ? chars.slice(0, limit - 1).join("") + "…"
        : val;
    return `${dim}:${short}`;
}
36
+ // ---------------------------------------------------------------------------
37
+ // Per-dimension evolution: group by held dims, show evolution along varied
38
+ // ---------------------------------------------------------------------------
39
/**
 * Prints a success-rate timeline for one agent, with one section per value of
 * `primaryDim`.
 *
 * Runs are grouped with `groupByDimension(runs, primaryDim)`. Inside each
 * group they are ordered by timestamp, oldest first, and printed one row per
 * run: run number, bar, success percentage, delta against the previous row,
 * and the values of the other varying dimensions. Under every row after the
 * first, up to three changed dimensions are listed; when the `prompt`
 * dimension changed and both runs carry a `systemPromptHash`, the `prompt:`
 * and `tools:` lines of the computed diff are printed as well.
 *
 * @param {string} name Agent name used in the section heading.
 * @param {Array<object>} runs Parsed reports for that agent.
 * @param {string} primaryDim Dimension to group by.
 * @param {string[]} varyingDims Dimensions that differ between runs; every
 *   entry other than `primaryDim` is shown on each row.
 * @returns {Promise<void>} Resolves once everything has been printed.
 */
async function printDimensionEvolution(name, runs, primaryDim, varyingDims) {
    // An unparseable timestamp gives NaN, and a comparator that returns NaN
    // leaves the sort order unspecified. Map such runs to 0 so they sort first.
    const timeOf = (run) => {
        const ms = new Date(run.timestamp).getTime();
        return Number.isNaN(ms) ? 0 : ms;
    };
    const secondaryDims = varyingDims.filter((d) => d !== primaryDim);
    // formatDelta pads its percentage output to five columns; the first row has
    // no previous run, so use a blank of the same width to keep columns aligned.
    const noDelta = " ".repeat(5);
    for (const [groupVal, groupRuns] of groupByDimension(runs, primaryDim)) {
        const sorted = [...groupRuns].sort((a, b) => timeOf(a) - timeOf(b));
        console.log(`\n ${name} / ${primaryDim}: ${groupVal}`);
        console.log(" " + "─".repeat(W - 2));
        for (let i = 0; i < sorted.length; i++) {
            const r = sorted[i];
            const prev = i > 0 ? sorted[i - 1] : undefined;
            const runNum = `#${i + 1}`.padStart(3);
            const pct = `${(r.successRate * 100).toFixed(0).padStart(3)}%`;
            const b = bar(r.successRate, 1, 16);
            const delta = prev ? formatDelta(prev.successRate, r.successRate) : noDelta;
            // Values of the other varying dimensions for this run
            const otherDims = secondaryDims
                .map((d) => shortDimLabel(d, r.dimensions?.[d] ?? "?", 12))
                .join(" ");
            console.log(` ${runNum} ${b} ${pct} ${delta} ${otherDims}`);
            if (!prev)
                continue;
            // What changed relative to the previous run in this group
            const diff = diffConfigs(prev.dimensions ?? {}, r.dimensions ?? {});
            const changedLabels = Object.entries(diff.varied)
                .filter(([k]) => k !== primaryDim)
                .map(([k, v]) => `${k}: ${v.from} → ${v.to}`)
                .slice(0, 3);
            for (const l of changedLabels) {
                console.log(` ${l}`);
            }
            // Prompt diff, only when the prompt changed and both hashes are known
            if (!(diff.varied["prompt"] && prev.systemPromptHash && r.systemPromptHash))
                continue;
            const [prevEntry, currEntry] = await Promise.all([
                loadDiffEntry(prev.systemPromptHash),
                loadDiffEntry(r.systemPromptHash),
            ]);
            if (!prevEntry || !currEntry)
                continue;
            const promptDiff = computeDiff(prevEntry, currEntry)
                .filter((l) => l.startsWith("prompt:") || l.startsWith("tools:"));
            for (const l of promptDiff)
                console.log(` ${l}`);
        }
    }
}
85
+ // ---------------------------------------------------------------------------
86
+ // Cross-dimension comparison
87
+ // ---------------------------------------------------------------------------
88
/**
 * Prints, for one agent, how success rate differs across values of `dim`
 * while every other dimension is held equal.
 *
 * Runs are bucketed by the values of all dimensions except `dim`. Within a
 * bucket only the most recent run per value of `dim` is kept. Buckets covering
 * fewer than two values of `dim` are not shown; if none qualify, nothing is
 * printed. Each shown bucket lists its values of `dim` from best to worst
 * success rate, with the gap to the best in parentheses.
 *
 * @param {string} name Agent name used in the heading.
 * @param {Array<object>} runs Parsed reports for that agent.
 * @param {string} dim Dimension whose values are compared.
 * @returns {void}
 */
function printCrossComparison(name, runs, dim) {
    // Collect the "held" dimensions from every run, not just the first one:
    // a key missing from runs[0] would otherwise be ignored and runs that
    // differ on it would be compared as if they shared a configuration.
    const otherDims = [];
    const seen = new Set([dim]);
    for (const r of runs) {
        for (const d of Object.keys(r.dimensions ?? {})) {
            if (!seen.has(d)) {
                seen.add(d);
                otherDims.push(d);
            }
        }
    }
    // Bucket key -> { parts, byValue }. The label parts are stored next to the
    // bucket instead of being recovered by splitting a "|"-joined key, which
    // breaks when a dimension value itself contains "|".
    const byConfig = new Map();
    for (const r of runs) {
        const parts = otherDims.map((d) => `${d}:${r.dimensions?.[d] ?? "?"}`);
        const key = JSON.stringify(parts);
        let bucket = byConfig.get(key);
        if (!bucket) {
            bucket = { parts, byValue: new Map() };
            byConfig.set(key, bucket);
        }
        const dimVal = r.dimensions?.[dim] ?? "?";
        // Keep the latest run per (config, dimValue) pair
        const existing = bucket.byValue.get(dimVal);
        if (!existing || new Date(r.timestamp) > new Date(existing.timestamp)) {
            bucket.byValue.set(dimVal, r);
        }
    }
    // Only show configs tested across 2+ values of the dimension
    const comparable = [...byConfig.values()].filter((bucket) => bucket.byValue.size > 1);
    if (comparable.length === 0)
        return;
    console.log(`\n Cross-${dim} comparison: ${name}`);
    console.log(" " + "─".repeat(W - 2));
    for (const { parts, byValue } of comparable) {
        const configLabel = parts
            .map((s) => s.length > 30 ? s.slice(0, 29) + "…" : s)
            .join(" + ");
        console.log(` ${configLabel}`);
        const entries = [...byValue.entries()].sort((a, b) => b[1].successRate - a[1].successRate);
        const best = entries[0][1].successRate;
        for (const [dimVal, r] of entries) {
            const pct = `${(r.successRate * 100).toFixed(0)}%`.padStart(4);
            const label = dimVal.length > 28 ? dimVal.slice(0, 27) + "…" : dimVal;
            const gap = ((r.successRate - best) * 100).toFixed(0);
            // toFixed renders a gap under half a point as "-0"; omit it rather
            // than print "(-0%)".
            const vs = r.successRate === best || gap === "-0" ? "" : ` (${gap}%)`;
            console.log(` ${label.padEnd(30)} ${pct}${vs}`);
        }
    }
}
126
+ // ---------------------------------------------------------------------------
127
+ // Attribution summary
128
+ // ---------------------------------------------------------------------------
129
/**
 * Prints a per-dimension impact summary for one agent.
 *
 * Uses the pairs returned by `findControlledPairs(runs)`. Pairs are grouped by
 * `variedDimension`; for each dimension the average delta, the number of
 * comparisons and up to two example transitions are printed. Dimensions are
 * listed from the largest to the smallest absolute delta observed. Nothing is
 * printed when there are no pairs.
 *
 * @param {string} name Agent name used in the heading.
 * @param {Array<object>} runs Parsed reports for that agent.
 * @returns {void}
 */
function printAttribution(name, runs) {
    const pairs = findControlledPairs(runs);
    if (pairs.length === 0)
        return;
    // Signed whole-number percentage. A value that rounds to zero is shown as
    // plain "0%" — toFixed would otherwise render small negatives as "-0".
    const signedPct = (fraction) => {
        const pts = (fraction * 100).toFixed(0);
        if (pts === "0" || pts === "-0")
            return "0%";
        return `${fraction > 0 ? "+" : ""}${pts}%`;
    };
    // Group by dimension, tracking the largest absolute delta while grouping
    // so the sort below does not recompute it on every comparison.
    const byDim = new Map();
    for (const p of pairs) {
        let entry = byDim.get(p.variedDimension);
        if (!entry) {
            entry = { deltas: [], examples: [], maxAbs: 0 };
            byDim.set(p.variedDimension, entry);
        }
        entry.deltas.push(p.delta);
        entry.maxAbs = Math.max(entry.maxAbs, Math.abs(p.delta));
        if (entry.examples.length < 2) {
            entry.examples.push(`${p.variedFrom} → ${p.variedTo}: ${signedPct(p.delta)}`);
        }
    }
    console.log(`\n Dimension Impact: ${name}`);
    console.log(" " + "─".repeat(W - 2));
    const sorted = [...byDim.entries()].sort((a, b) => b[1].maxAbs - a[1].maxAbs);
    for (const [dim, { deltas, examples }] of sorted) {
        const avgDelta = deltas.reduce((a, b) => a + b, 0) / deltas.length;
        const avgStr = signedPct(avgDelta);
        console.log(` ${dim.padEnd(12)} ${avgStr.padStart(6)} avg (${deltas.length} comparison${deltas.length !== 1 ? "s" : ""})`);
        for (const ex of examples) {
            console.log(` ${ex}`);
        }
    }
}
158
+ // ---------------------------------------------------------------------------
159
+ // Purge
160
+ // ---------------------------------------------------------------------------
161
/**
 * Deletes every `.reports` and `.diff` directory found under `cwd`.
 *
 * The tree is walked depth-first. Entries named `node_modules`, `dist`,
 * `.git` or `.pnpm` are never entered, other dot-directories are not entered
 * either, and the walk stops descending past depth 6. Directories that cannot
 * be listed are skipped. Each removal is logged relative to `cwd`, followed by
 * a final count.
 *
 * @param {string} cwd Directory to start from.
 * @returns {Promise<void>} Resolves after the summary line has been printed.
 */
async function purge(cwd) {
    const ignored = new Set(["node_modules", "dist", ".git", ".pnpm"]);
    const targets = new Set([".reports", ".diff"]);
    let removed = 0;
    const visit = async (dir, depth) => {
        if (depth > 6)
            return;
        let children;
        try {
            children = await readdir(dir, { withFileTypes: true });
        }
        catch {
            // Unreadable directory: skip it and keep going.
            return;
        }
        for (const child of children) {
            if (ignored.has(child.name) || !child.isDirectory())
                continue;
            const childPath = join(dir, child.name);
            if (targets.has(child.name)) {
                await rm(childPath, { recursive: true, force: true });
                console.log(` removed ${relative(cwd, childPath)}/`);
                removed += 1;
                continue;
            }
            // Remaining dot-directories are left alone.
            if (child.name.startsWith("."))
                continue;
            await visit(childPath, depth + 1);
        }
    };
    await visit(cwd, 0);
    console.log(`\n Purged ${removed} director${removed !== 1 ? "ies" : "y"}.\n`);
}
193
+ // ---------------------------------------------------------------------------
194
+ // Main
195
+ // ---------------------------------------------------------------------------
87
196
  async function main() {
197
+ const args = process.argv.slice(2);
198
+ const agentFlagIdx = args.indexOf("--agent");
199
+ const agentFilter = agentFlagIdx !== -1 ? args[agentFlagIdx + 1] : undefined;
200
+ if (args.includes("--purge")) {
201
+ await purge(process.cwd());
202
+ return;
203
+ }
88
204
  const cwd = process.cwd();
89
205
  const files = await findReports(cwd);
90
206
  if (files.length === 0) {
91
207
  console.log("\n No reports found. Run some agent tests first.\n");
92
208
  return;
93
209
  }
94
- const reports = await Promise.all(files.map(async (f) => {
210
+ let reports = await Promise.all(files.map(async (f) => {
95
211
  const content = await readFile(f, "utf-8");
96
212
  return parseReport(content, relative(cwd, f));
97
213
  }));
214
+ // Ensure all reports have dimensions (backward compat)
215
+ await Promise.all(reports.map((r) => ensureDimensions(r)));
216
+ if (agentFilter) {
217
+ reports = reports.filter((r) => r.name?.toLowerCase() === agentFilter.toLowerCase());
218
+ if (reports.length === 0) {
219
+ console.log(`\n No reports found for agent "${agentFilter}".\n`);
220
+ return;
221
+ }
222
+ }
98
223
  console.log("\n" + "━".repeat(W));
99
- console.log(` AGEST STATS · ${reports.length} report${reports.length !== 1 ? "s" : ""} found`);
224
+ const filterLabel = agentFilter ? ` · agent: ${agentFilter}` : "";
225
+ console.log(` AGEST STATS · ${reports.length} report${reports.length !== 1 ? "s" : ""} found${filterLabel}`);
100
226
  console.log("━".repeat(W));
101
227
  // Aggregate by model
102
228
  const byModel = new Map();
@@ -148,6 +274,37 @@ async function main() {
148
274
  value: a.avgDuration,
149
275
  display: formatDuration(a.avgDuration).padStart(8),
150
276
  })), maxDuration);
277
+ // Dimension-aware evolution — named agents with more than one run
278
+ const named = reports.filter((r) => r.name);
279
+ const byAgentName = new Map();
280
+ for (const r of named) {
281
+ const arr = byAgentName.get(r.name) ?? [];
282
+ arr.push(r);
283
+ byAgentName.set(r.name, arr);
284
+ }
285
+ const evolvingAgents = [...byAgentName.entries()].filter(([, runs]) => runs.length > 1);
286
+ if (evolvingAgents.length > 0) {
287
+ console.log(`\n ${"─".repeat(W - 2)}`);
288
+ console.log(` EVOLUTION · dimension-aware grouping`);
289
+ for (const [name, runs] of evolvingAgents) {
290
+ const varyingDims = findVaryingDimensions(runs);
291
+ if (varyingDims.length === 0) {
292
+ // All runs have identical config — just show flat timeline
293
+ await printDimensionEvolution(name, runs, "model", []);
294
+ }
295
+ else {
296
+ // Group by the primary varying dimension (most unique values)
297
+ const primaryDim = varyingDims[0];
298
+ await printDimensionEvolution(name, runs, primaryDim, varyingDims);
299
+ // Cross-comparison for the primary varying dimension
300
+ if (varyingDims.length >= 2) {
301
+ printCrossComparison(name, runs, primaryDim);
302
+ }
303
+ }
304
+ // Attribution summary from controlled pairs
305
+ printAttribution(name, runs);
306
+ }
307
+ }
151
308
  console.log("\n" +
152
309
  "━".repeat(W) +
153
310
  `\n ${agg.length} model${agg.length !== 1 ? "s" : ""} · ${reports.length} total runs\n` +
package/dist/types.d.ts CHANGED
@@ -20,6 +20,14 @@ export interface SceneDefinition {
20
20
  field: string;
21
21
  fn: (value: any) => void;
22
22
  }>;
23
+ timeout?: number;
24
+ turns?: number;
25
+ }
26
/** Verdict values a {@link JudgeResult} can carry. */
export type JudgeVerdict =
    | "pass"
    | "fail"
    | "partial";
27
/**
 * Judge outcome that a `SceneResult` may carry in its optional `judgement`
 * field.
 */
export interface JudgeResult {
    /** The verdict itself. */
    verdict: JudgeVerdict;
    /** NOTE(review): presumably the judge's free-text explanation — confirm. */
    reasoning: string;
    /** NOTE(review): presumably the criteria the verdict was judged against — confirm. */
    criteria: string;
}
24
32
  export interface SceneResult {
25
33
  prompt: string;
@@ -27,10 +35,14 @@ export interface SceneResult {
27
35
  duration: number;
28
36
  passed: boolean;
29
37
  error?: string;
38
+ judgement?: JudgeResult;
30
39
  }
31
40
  export interface AgentReport {
41
+ name?: string;
32
42
  model?: string;
33
43
  systemPromptHash?: string;
44
+ promptHash?: string;
45
+ dimensions?: Record<string, string>;
34
46
  tools?: string[];
35
47
  successRate: number;
36
48
  failedCases: string[];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sebastiantuyu/agest",
3
- "version": "0.1.6",
3
+ "version": "0.2.1",
4
4
  "description": "A testing library for agents",
5
5
  "repository": {
6
6
  "type": "git",
@@ -23,11 +23,15 @@
23
23
  }
24
24
  },
25
25
  "scripts": {
26
- "build": "tsc",
27
- "test": "node dist/index.js",
26
+ "build": "tsc -p tsconfig.build.json",
27
+ "test": "vitest run",
28
+ "test:watch": "vitest",
29
+ "test:coverage": "vitest run --coverage",
28
30
  "dev": "tsx examples/basic.test.ts",
29
31
  "test:examples": "tsx examples/basic.test.ts && tsx examples/agent.test.ts",
30
32
  "stats": "tsx src/stats.ts",
33
+ "preview": "tsx src/preview.ts",
34
+ "site:preview": "npx serve site -p 3000",
31
35
  "release:patch": "npm version patch && git push && git push --tags",
32
36
  "release:minor": "npm version minor && git push && git push --tags",
33
37
  "release:major": "npm version major && git push && git push --tags"
@@ -40,10 +44,15 @@
40
44
  "@langchain/langgraph": "^1.2.8",
41
45
  "@langchain/openai": "^1.4.4",
42
46
  "@types/node": "^22.0.0",
47
+ "@vitest/coverage-v8": "^3",
43
48
  "dotenv": "^17.4.1",
44
49
  "langchain": "^1.3.1",
45
50
  "tsx": "^4.21.0",
46
51
  "typescript": "^5.4.0",
52
+ "vitest": "^3",
47
53
  "zod": "^4.3.6"
54
+ },
55
+ "dependencies": {
56
+ "@supercharge/promise-pool": "^3.3.0"
48
57
  }
49
58
  }