@sebastiantuyu/agest 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
1
  import type { AgentReport } from "./types";
2
2
  export declare function formatReport(report: AgentReport): string;
3
- export declare function writeReport(content: string, timestamp: string): Promise<string>;
3
+ export declare function writeReport(content: string, timestamp: string, name?: string, dimensions?: Record<string, string>): Promise<string>;
4
+ export declare function writeDiffEntry(hash: string, systemPrompt: string, tools: string[], model?: string): Promise<void>;
package/dist/reporter.js CHANGED
@@ -1,15 +1,18 @@
1
- import { mkdir, writeFile } from "fs/promises";
1
+ import { access, mkdir, writeFile } from "fs/promises";
2
+ import { createHash } from "crypto";
2
3
  import { join } from "path";
3
4
  export function formatReport(report) {
4
- const lines = [
5
- "agent:",
6
- ` model: "${report.model ?? "unknown"}"`,
7
- ` system_prompt: ${report.systemPromptHash ?? "<unknown>"}`,
8
- ` tools: ${JSON.stringify(report.tools ?? [])}`,
9
- ` success_rate: ${report.successRate}`,
10
- ` failed_cases_count: ${report.failedCases.length}`,
11
- ` failed_cases:`,
12
- ];
5
+ const lines = ["agent:"];
6
+ if (report.name)
7
+ lines.push(` name: "${report.name}"`);
8
+ lines.push(` model: "${report.model ?? "unknown"}"`, ` system_prompt: ${report.systemPromptHash ?? "<unknown>"}`, ` prompt_hash: ${report.promptHash ?? "<unknown>"}`, ` tools: ${JSON.stringify(report.tools ?? [])}`);
9
+ if (report.dimensions && Object.keys(report.dimensions).length > 0) {
10
+ lines.push(` dimensions:`);
11
+ for (const [key, value] of Object.entries(report.dimensions)) {
12
+ lines.push(` ${key}: "${value}"`);
13
+ }
14
+ }
15
+ lines.push(` success_rate: ${report.successRate}`, ` failed_cases_count: ${report.failedCases.length}`, ` failed_cases:`);
13
16
  if (report.failedCases.length === 0) {
14
17
  lines.push(" (none)");
15
18
  }
@@ -31,12 +34,44 @@ export function formatReport(report) {
31
34
  }
32
35
  return lines.join("\n");
33
36
  }
34
/**
 * Writes a formatted report into `<cwd>/.reports` and returns the file path.
 *
 * The file is named `report[-<name>]-<suffix>.yaml`. When `dimensions` has at
 * least one key, the suffix is the first 8 hex characters of a SHA-256 digest
 * over the JSON-serialised dimension entries sorted by key, so the same
 * dimension set always maps to the same file. Otherwise the suffix is
 * `timestamp` with every `:` and `.` replaced by `-`.
 *
 * If the target file already exists a warning is printed before it is
 * overwritten.
 *
 * @param content - Report text, written as UTF-8.
 * @param timestamp - Only used for the file name when no dimensions are given.
 * @param name - Optional agent name; characters outside `[a-zA-Z0-9_-]` become `_`.
 * @param dimensions - Optional dimension map used to derive a stable file name.
 * @returns The path of the written file.
 */
export async function writeReport(content, timestamp, name, dimensions) {
    const targetDir = join(process.cwd(), ".reports");
    await mkdir(targetDir, { recursive: true });

    const namePart = name ? `-${name.replace(/[^a-zA-Z0-9_-]/g, "_")}` : "";
    const hasDimensions = Boolean(dimensions) && Object.keys(dimensions).length > 0;

    let suffix;
    if (hasDimensions) {
        // Sort by key so the digest does not depend on insertion order.
        const orderedEntries = Object.entries(dimensions).sort((left, right) => left[0].localeCompare(right[0]));
        suffix = createHash("sha256").update(JSON.stringify(orderedEntries)).digest("hex").slice(0, 8);
    }
    else {
        suffix = timestamp.replace(/[:.]/g, "-");
    }
    const filepath = join(targetDir, `report${namePart}-${suffix}.yaml`);

    // The probe exists only to warn; a failed probe means nothing to overwrite.
    let alreadyPresent = true;
    try {
        await access(filepath);
    }
    catch {
        alreadyPresent = false;
    }
    if (alreadyPresent) {
        console.warn(`\x1b[33m⚠ Overwriting previous report for ${name ?? "unnamed"} (same config)\x1b[0m`);
    }

    await writeFile(filepath, content, "utf-8");
    return filepath;
}
60
/**
 * Records the configuration behind `hash` as `<cwd>/.diff/<hash>.yaml`.
 *
 * The entry is written at most once: if the file already exists the call
 * returns without touching it. The file holds the system prompt as an
 * indented block under `system_prompt: |`, the tools as a JSON array on a
 * `tools:` line and, when `model` is truthy, a quoted `model:` line.
 *
 * @param hash - Identifier used as the file name.
 * @param systemPrompt - Prompt text; each line is written with a leading space.
 * @param tools - Tool names, serialised with `JSON.stringify`.
 * @param model - Optional model identifier.
 */
export async function writeDiffEntry(hash, systemPrompt, tools, model) {
    const targetDir = join(process.cwd(), ".diff");
    await mkdir(targetDir, { recursive: true });
    const filepath = join(targetDir, `${hash}.yaml`);

    // An entry is immutable once written, so an existing file ends the call.
    let alreadyPresent = true;
    try {
        await access(filepath);
    }
    catch {
        alreadyPresent = false;
    }
    if (alreadyPresent) {
        return;
    }

    const promptBlock = systemPrompt.split("\n").map((promptLine) => ` ${promptLine}`);
    const body = [`system_prompt: |`, ...promptBlock, `tools: ${JSON.stringify(tools)}`];
    if (model) {
        body.push(`model: "${model}"`);
    }
    await writeFile(filepath, body.join("\n"), "utf-8");
}
@@ -0,0 +1,78 @@
1
/**
 * Structured form of one report file, as returned by `parseReport`.
 */
export interface ParsedReport {
    /** Value of the report's `name` field, when present. */
    name?: string;
    /** Value of the report's `system_prompt` field, when present. */
    systemPromptHash?: string;
    /** Value of the report's `prompt_hash` field, when present. */
    promptHash?: string;
    /** Entries of the report's `dimensions` block, when it has at least one. */
    dimensions?: Record<string, string>;
    /** JSON-decoded `tools` field; absent when the field is missing or not valid JSON. */
    tools?: string[];
    /** Value of the `model` field, or `"unknown"` when the report has none. */
    model: string;
    /** Numeric `success_rate` field (0 when missing). */
    successRate: number;
    /** Numeric `total_cases` field (0 when missing). */
    totalCases: number;
    /** Integer `failed_cases_count` field (0 when missing). */
    failedCasesCount: number;
    /** Entries listed under `failed_cases:`; `reason` is set only when a reason line follows the prompt. */
    failedCases: {
        prompt: string;
        reason?: string;
    }[];
    /** Numeric `duration` field (0 when missing). Unit is not stated here — presumably milliseconds, confirm against the writer. */
    duration: number;
    /** Value of the `timestamp` field, or an empty string when missing. */
    timestamp: string;
    /** Numeric `average_input_tokens_per_case` field, when present. */
    averageInputTokensPerCase?: number;
    /** Numeric `average_output_tokens_per_case` field, when present. */
    averageOutputTokensPerCase?: number;
    /** The `source` argument given to `parseReport`, passed through unchanged. */
    source: string;
}
21
/**
 * Agent configuration stored in a `.diff/<hash>.yaml` file.
 */
export interface DiffEntry {
    /** System prompt text; an empty string when the file has no prompt block. */
    systemPrompt: string;
    /** Tool names from the `tools:` line; an empty array when the line is missing. */
    tools: string[];
    /** Model identifier from the `model:` line, when the file has one. */
    model?: string;
}
26
+ export declare function extractField(content: string, key: string): string | undefined;
27
+ export declare function parseFailedCases(content: string): Array<{
28
+ prompt: string;
29
+ reason?: string;
30
+ }>;
31
+ export declare function parseDimensions(content: string): Record<string, string> | undefined;
32
+ export declare function parseReport(content: string, source: string): ParsedReport;
33
+ export declare function findReports(dir: string, depth?: number): Promise<string[]>;
34
+ export declare function loadDiffEntry(hash: string): Promise<DiffEntry | null>;
35
+ export declare function computeDiff(a: DiffEntry, b: DiffEntry): string[];
36
/**
 * Result of comparing two dimension maps key by key.
 */
export interface ConfigDiff {
    /** Keys whose value is the same on both sides, with that value. */
    held: Record<string, string>;
    /** Keys whose value differs; a key missing on one side is reported as `"(absent)"`. */
    varied: Record<string, {
        from: string;
        to: string;
    }>;
    /** Number of keys in `varied`. */
    changedCount: number;
}
44
/**
 * A pair of reports whose dimension maps differ in exactly one key.
 */
export interface ControlledComparison {
    /** First report of the pair. */
    a: ParsedReport;
    /** Second report of the pair. */
    b: ParsedReport;
    /** Name of the single dimension that differs between `a` and `b`. */
    variedDimension: string;
    /** Value of that dimension in `a`. */
    variedFrom: string;
    /** Value of that dimension in `b`. */
    variedTo: string;
    /** `b.successRate - a.successRate`. */
    delta: number;
}
52
+ /**
53
+ * Reconstruct dimensions from legacy report fields (backward compat).
54
+ * For old reports that lack the `dimensions` block, we build one from
55
+ * model, promptHash (or systemPromptHash + diff entry), and tools.
56
+ */
57
+ export declare function ensureDimensions(report: ParsedReport): Promise<Record<string, string>>;
58
+ /**
59
+ * Diff two config maps generically. Returns which dimensions were
60
+ * held constant vs varied, without knowing what the dimensions are.
61
+ */
62
+ export declare function diffConfigs(a: Record<string, string>, b: Record<string, string>): ConfigDiff;
63
+ /**
64
+ * Find all report pairs within the same agent where exactly one
65
+ * dimension differs. These are "controlled comparisons" — the delta
66
+ * can be cleanly attributed to the single varied dimension.
67
+ */
68
+ export declare function findControlledPairs(reports: ParsedReport[]): ControlledComparison[];
69
+ /**
70
+ * Detect which dimensions vary across a set of reports.
71
+ * Returns dimension names sorted by number of unique values (most varying first).
72
+ */
73
+ export declare function findVaryingDimensions(reports: ParsedReport[]): string[];
74
+ /**
75
+ * Group reports by the value of a specific dimension.
76
+ */
77
+ export declare function groupByDimension(reports: ParsedReport[], dimension: string): Map<string, ParsedReport[]>;
78
+ export declare function formatDuration(ms: number): string;
@@ -0,0 +1,278 @@
1
+ import { createHash } from "crypto";
2
+ import { readdir, readFile } from "fs/promises";
3
+ import { join } from "path";
4
/**
 * Reads the value of a single-line `key: value` field from report text.
 *
 * A field is a line that starts with one space followed by `key:`. The value
 * is whatever follows on the same line, with surrounding whitespace and one
 * pair of enclosing double quotes removed. The first matching line wins.
 *
 * @param content - Full report text.
 * @param key - Field name, matched literally.
 * @returns The field value, or `undefined` when the field is missing or has
 *   no value on its own line.
 */
export function extractField(content, key) {
    // Match the key literally even if it contains regex metacharacters.
    const literalKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // `[ \t]*` (not `\s*`) keeps the match on one line, so an empty field can
    // never swallow the following line as its value.
    const match = new RegExp(`^ ${literalKey}:[ \\t]*(\\S.*)$`, "m").exec(content);
    if (!match)
        return undefined;
    // Trim first so trailing whitespace cannot shield the closing quote, then
    // trim again to keep the original handling of padding inside the quotes.
    return match[1].trim().replace(/^"|"$/g, "").trim();
}
11
/**
 * Collects the entries listed under the `failed_cases:` line of a report.
 *
 * Scanning starts on the line after the header and stops at the first line
 * that does not begin with a space. Each `- "<prompt>"` line becomes one
 * entry; if the line right after it is `reason: "<text>"`, that text is
 * attached as the entry's `reason`.
 *
 * @param content - Full report text.
 * @returns The failed cases in file order; empty when there is no header.
 */
export function parseFailedCases(content) {
    const rows = content.split("\n");
    const header = rows.findIndex((row) => row.trimStart().startsWith("failed_cases:"));
    if (header === -1) {
        return [];
    }
    const failures = [];
    let cursor = header + 1;
    while (cursor < rows.length && rows[cursor].startsWith(" ")) {
        const prompt = /^\s+- "(.+)"$/.exec(rows[cursor])?.[1];
        if (prompt !== undefined) {
            // The reason, if any, sits on the line directly below the prompt.
            const reason = rows[cursor + 1]?.match(/^\s+reason: "(.+)"$/)?.[1];
            failures.push({ prompt, reason });
        }
        cursor += 1;
    }
    return failures;
}
30
/**
 * Reads the `key: value` entries listed under the `dimensions:` line.
 *
 * Scanning starts on the line after the header and stops at the first line
 * that does not begin with a space. Keys are word characters only; values
 * may be wrapped in double quotes, which are dropped.
 *
 * @param content - Full report text.
 * @returns The dimension map, or `undefined` when there is no header or no
 *   entry could be read.
 */
export function parseDimensions(content) {
    const rows = content.split("\n");
    const header = rows.findIndex((row) => row.trimStart().startsWith("dimensions:"));
    if (header === -1) {
        return undefined;
    }
    const parsed = {};
    for (const row of rows.slice(header + 1)) {
        if (!row.startsWith(" ")) {
            break;
        }
        const pair = /^\s+(\w+):\s*"?(.+?)"?\s*$/.exec(row);
        if (pair) {
            const [, key, value] = pair;
            parsed[key] = value;
        }
    }
    return Object.keys(parsed).length === 0 ? undefined : parsed;
}
47
/**
 * Turns report text back into a structured record.
 *
 * Scalar fields are read with `extractField`; numeric fields go through
 * `parseFloat` (or `parseInt` for `failed_cases_count`) and default to 0 when
 * missing. `tools` is JSON-decoded and left `undefined` when the field is
 * missing or not valid JSON. `model` defaults to `"unknown"` and `timestamp`
 * to an empty string.
 *
 * @param content - Full report text.
 * @param source - Stored unchanged on the result as `source`.
 * @returns The parsed report.
 */
export function parseReport(content, source) {
    const field = (key) => extractField(content, key);
    const numeric = (key, fallback = 0) => parseFloat(field(key) ?? String(fallback));
    const optionalNumber = (raw) => (raw != null ? parseFloat(raw) : undefined);

    let tools;
    const toolsRaw = field("tools");
    if (toolsRaw) {
        try {
            tools = JSON.parse(toolsRaw);
        }
        catch {
            // An undecodable tools list is treated the same as a missing one.
            tools = undefined;
        }
    }

    return {
        name: field("name"),
        systemPromptHash: field("system_prompt"),
        promptHash: field("prompt_hash"),
        dimensions: parseDimensions(content),
        tools,
        model: field("model") ?? "unknown",
        successRate: numeric("success_rate"),
        totalCases: numeric("total_cases"),
        failedCasesCount: parseInt(field("failed_cases_count") ?? "0", 10),
        failedCases: parseFailedCases(content),
        duration: numeric("duration"),
        timestamp: field("timestamp") ?? "",
        averageInputTokensPerCase: optionalNumber(field("average_input_tokens_per_case")),
        averageOutputTokensPerCase: optionalNumber(field("average_output_tokens_per_case")),
        source,
    };
}
84
/**
 * Recursively collects report files stored in `.reports` directories.
 *
 * Starting at `dir`, every directory named `.reports` contributes its
 * `.yaml` / `.yml` entries. Other directories are descended into unless they
 * are in the skip list (`node_modules`, `dist`, `.git`, `.pnpm`) or their
 * name starts with a dot. The search is best-effort: a directory that cannot
 * be read is skipped instead of failing the whole scan.
 *
 * @param dir - Directory to start from.
 * @param depth - Current recursion depth; levels deeper than 6 are not searched.
 * @returns Paths of the report files found.
 */
export async function findReports(dir, depth = 0) {
    if (depth > 6)
        return [];
    const SKIP = new Set(["node_modules", "dist", ".git", ".pnpm"]);
    let entries;
    try {
        entries = await readdir(dir, { withFileTypes: true });
    }
    catch {
        return [];
    }
    const results = [];
    for (const entry of entries) {
        if (SKIP.has(entry.name) || !entry.isDirectory())
            continue;
        const fullPath = join(dir, entry.name);
        if (entry.name === ".reports") {
            let files;
            try {
                files = await readdir(fullPath);
            }
            catch {
                // Same policy as the listing above: an unreadable (or just
                // removed) `.reports` directory must not abort the scan.
                continue;
            }
            for (const f of files) {
                if (f.endsWith(".yaml") || f.endsWith(".yml")) {
                    results.push(join(fullPath, f));
                }
            }
        }
        else if (!entry.name.startsWith(".")) {
            for (const found of await findReports(fullPath, depth + 1)) {
                results.push(found);
            }
        }
    }
    return results;
}
116
/**
 * Loads the configuration stored in `<cwd>/.diff/<hash>.yaml`.
 *
 * The prompt is the block between `system_prompt: |` and the `tools:` line,
 * with one leading space removed from each line and trailing whitespace
 * trimmed. `tools` is the JSON array on the `tools:` line and `model` the
 * quoted value on the `model:` line.
 *
 * @param hash - Identifier used as the file name.
 * @returns The entry, or `null` when the file cannot be read or decoded.
 */
export async function loadDiffEntry(hash) {
    try {
        const text = await readFile(join(process.cwd(), ".diff", `${hash}.yaml`), "utf-8");
        const promptBlock = /^system_prompt: \|\n([\s\S]*?)(?=\ntools:)/m.exec(text);
        const toolsLine = /^tools: (.+)$/m.exec(text);
        const modelLine = /^model: "(.+)"$/m.exec(text);

        let systemPrompt = "";
        if (promptBlock) {
            systemPrompt = promptBlock[1].replace(/^ /gm, "").trimEnd();
        }
        const tools = toolsLine ? JSON.parse(toolsLine[1]) : [];
        return { systemPrompt, tools, model: modelLine?.[1] };
    }
    catch {
        // Missing file and malformed content are both reported as "no entry".
        return null;
    }
}
134
/**
 * Summarises how configuration `b` differs from configuration `a`.
 *
 * The summary lists, in this order: the model change (old as `-`, new as
 * `+`), tools only in `b` and tools only in `a`, then up to three prompt
 * lines only in `b` followed by up to three only in `a`. Prompt lines are
 * compared after trimming, blank lines are ignored, and each reported line
 * is cut to 60 characters.
 *
 * @param a - Baseline configuration.
 * @param b - Configuration compared against the baseline.
 * @returns Human-readable diff lines; empty when nothing differs.
 */
export function computeDiff(a, b) {
    const summary = [];

    if (a.model !== b.model) {
        if (a.model) {
            summary.push(`model: - "${a.model}"`);
        }
        if (b.model) {
            summary.push(`model: + "${b.model}"`);
        }
    }

    const toolsGained = b.tools.filter((tool) => !a.tools.includes(tool));
    const toolsLost = a.tools.filter((tool) => !b.tools.includes(tool));
    if (toolsGained.length) {
        summary.push(`tools: +[${toolsGained.join(", ")}]`);
    }
    if (toolsLost.length) {
        summary.push(`tools: -[${toolsLost.join(", ")}]`);
    }

    // Distinct, trimmed, non-blank lines of a prompt.
    const distinctLines = (prompt) => new Set(prompt.split("\n").map((text) => text.trim()).filter(Boolean));
    const baseline = distinctLines(a.systemPrompt);
    const candidate = distinctLines(b.systemPrompt);
    const gained = [...candidate].filter((text) => !baseline.has(text)).slice(0, 3);
    const lost = [...baseline].filter((text) => !candidate.has(text)).slice(0, 3);
    gained.forEach((text) => summary.push(`prompt: + "${text.slice(0, 60)}"`));
    lost.forEach((text) => summary.push(`prompt: - "${text.slice(0, 60)}"`));

    return summary;
}
158
/**
 * Returns the report's dimension map, rebuilding it for reports that were
 * written without a `dimensions` block.
 *
 * A rebuilt map contains:
 * - `model`: the report's model;
 * - `prompt`: the report's `promptHash`, or — when only `systemPromptHash` is
 *   known and its diff entry can be loaded — the first 12 hex characters of
 *   the SHA-256 of the stored system prompt (omitted when neither works);
 * - `tools`: the tool names sorted and comma-joined, or `"none"`.
 *
 * The rebuilt map is stored back on `report.dimensions`, and a derived prompt
 * hash is stored on `report.promptHash`.
 *
 * @param report - Parsed report; mutated when dimensions are rebuilt.
 * @returns The existing or rebuilt dimension map.
 */
export async function ensureDimensions(report) {
    if (report.dimensions) {
        return report.dimensions;
    }

    let prompt;
    if (report.promptHash) {
        prompt = report.promptHash;
    }
    else if (report.systemPromptHash) {
        // Derive a prompt-only hash from the stored diff entry.
        const entry = await loadDiffEntry(report.systemPromptHash);
        if (entry) {
            prompt = createHash("sha256").update(entry.systemPrompt).digest("hex").slice(0, 12);
            report.promptHash = prompt;
        }
    }

    // Sort a copy so the caller's tool list keeps its order.
    const tools = report.tools?.length ? [...report.tools].sort().join(",") : "none";

    const rebuilt = {
        model: report.model,
        ...(prompt === undefined ? {} : { prompt }),
        tools,
    };
    report.dimensions = rebuilt;
    return rebuilt;
}
188
/**
 * Compares two dimension maps key by key, without any knowledge of what the
 * dimensions mean.
 *
 * Every key present on either side is classified as held (same value on both
 * sides) or varied (different values). A key missing on one side is compared
 * as the literal value `"(absent)"`.
 *
 * @param a - First dimension map.
 * @param b - Second dimension map.
 * @returns The held values, the varied values as `{ from, to }` pairs, and
 *   the number of varied keys.
 */
export function diffConfigs(a, b) {
    const ABSENT = "(absent)";
    const held = {};
    const varied = {};
    for (const key of new Set([...Object.keys(a), ...Object.keys(b)])) {
        const from = a[key] ?? ABSENT;
        const to = b[key] ?? ABSENT;
        if (from !== to) {
            varied[key] = { from, to };
            continue;
        }
        held[key] = from;
    }
    return { held, varied, changedCount: Object.keys(varied).length };
}
208
/**
 * Finds all report pairs within the same agent where exactly one dimension
 * differs. These are "controlled comparisons": the change in success rate
 * can be attributed to the single varied dimension.
 *
 * Reports without a `dimensions` map are ignored. Two reports belong to the
 * same agent when their `name` values are equal (two unnamed reports count
 * as the same agent).
 *
 * @param reports - Parsed reports to pair up.
 * @returns One comparison per qualifying pair, with `delta` computed as
 *   `b.successRate - a.successRate`, ordered by largest absolute delta first.
 */
export function findControlledPairs(reports) {
    const pairs = [];
    for (let i = 0; i < reports.length; i++) {
        for (let j = i + 1; j < reports.length; j++) {
            const a = reports[i];
            const b = reports[j];
            if (!a.dimensions || !b.dimensions)
                continue;
            // Enforce the "same agent" contract: a delta between two different
            // agents is not attributable to the varied dimension.
            if (a.name !== b.name)
                continue;
            const diff = diffConfigs(a.dimensions, b.dimensions);
            if (diff.changedCount !== 1)
                continue;
            const [dimName, { from, to }] = Object.entries(diff.varied)[0];
            pairs.push({
                a,
                b,
                variedDimension: dimName,
                variedFrom: from,
                variedTo: to,
                delta: b.successRate - a.successRate,
            });
        }
    }
    return pairs.sort((x, y) => Math.abs(y.delta) - Math.abs(x.delta));
}
237
/**
 * Lists the dimensions that take more than one value across the reports.
 *
 * Reports without a `dimensions` map are ignored.
 *
 * @param reports - Parsed reports to inspect.
 * @returns Dimension names ordered by number of distinct values, most
 *   varying first; ties keep the order in which the names were first seen.
 */
export function findVaryingDimensions(reports) {
    const seenValues = new Map();
    for (const { dimensions } of reports) {
        if (!dimensions) {
            continue;
        }
        for (const name of Object.keys(dimensions)) {
            if (!seenValues.has(name)) {
                seenValues.set(name, new Set());
            }
            seenValues.get(name).add(dimensions[name]);
        }
    }

    const varying = [];
    for (const [name, values] of seenValues) {
        if (values.size > 1) {
            varying.push([name, values.size]);
        }
    }
    varying.sort((left, right) => right[1] - left[1]);
    return varying.map(([name]) => name);
}
257
/**
 * Buckets reports by their value for one dimension.
 *
 * Reports that have no `dimensions` map, or no value for `dimension`, are
 * collected under the key `"(unknown)"`.
 *
 * @param reports - Parsed reports to group.
 * @param dimension - Name of the dimension to group by.
 * @returns A map from dimension value to the reports carrying it; keys and
 *   reports keep their first-seen order.
 */
export function groupByDimension(reports, dimension) {
    const buckets = new Map();
    reports.forEach((report) => {
        const label = report.dimensions?.[dimension] ?? "(unknown)";
        if (buckets.has(label)) {
            buckets.get(label).push(report);
        }
        else {
            buckets.set(label, [report]);
        }
    });
    return buckets;
}
270
/**
 * Formats a duration in milliseconds for display.
 *
 * - below one second: whole milliseconds, e.g. `250ms`;
 * - below one minute: seconds with one decimal, e.g. `1.5s`;
 * - otherwise: minutes and zero-padded whole seconds, e.g. `2m05s`.
 *
 * The unit is chosen from the rounded value that will be shown, so rounding
 * never produces an out-of-range figure such as `1000ms`, `60.0s` or `1m60s`.
 *
 * @param ms - Duration in milliseconds.
 * @returns The formatted duration.
 */
export function formatDuration(ms) {
    const wholeMs = ms.toFixed(0);
    if (Number(wholeMs) < 1000)
        return `${wholeMs}ms`;
    const seconds = (ms / 1000).toFixed(1);
    if (Number(seconds) < 60)
        return `${seconds}s`;
    // Round the total first so a remainder of 59.5s or more carries into the minutes.
    const totalSeconds = Math.round(ms / 1000);
    const m = Math.floor(totalSeconds / 60);
    const s = String(totalSeconds % 60).padStart(2, "0");
    return `${m}m${s}s`;
}
package/dist/runner.d.ts CHANGED
@@ -1,3 +1,4 @@
1
1
  import type { AgentExecutor, AgentResponse, SceneDefinition, SceneResult } from "./types";
2
+ import type { JudgeConfig } from "./config";
2
3
  export declare function extractField(response: AgentResponse, field: string): unknown;
3
- export declare function executeScene(executor: AgentExecutor, scene: SceneDefinition): Promise<SceneResult>;
4
+ export declare function executeScene(executor: AgentExecutor, scene: SceneDefinition, globalTimeout?: number, judgeConfig?: JudgeConfig, globalTurns?: number): Promise<SceneResult>;
package/dist/runner.js CHANGED
@@ -1,3 +1,6 @@
1
+ import { collectPendingJudgements } from "./assertions";
2
+ import { callJudge, resolveJudgeExecutor } from "./judge";
3
+ const DEFAULT_SCENE_TIMEOUT = 10_000;
1
4
  export function extractField(response, field) {
2
5
  switch (field) {
3
6
  case "response":
@@ -10,12 +13,24 @@ export function extractField(response, field) {
10
13
  return response.metadata?.[field];
11
14
  }
12
15
  }
13
- export async function executeScene(executor, scene) {
14
- let response;
16
+ export async function executeScene(executor, scene, globalTimeout, judgeConfig, globalTurns) {
17
+ let response = { text: "" };
15
18
  let duration;
19
+ const timeoutMs = scene.timeout ?? globalTimeout ?? DEFAULT_SCENE_TIMEOUT;
20
+ const turns = scene.turns ?? globalTurns ?? 1;
16
21
  try {
17
22
  const start = performance.now();
18
- response = await executor(scene.prompt);
23
+ let input = scene.prompt;
24
+ for (let t = 0; t < turns; t++) {
25
+ response = await Promise.race([
26
+ executor(input),
27
+ new Promise((_, reject) => setTimeout(() => reject(new Error(`Scene timed out after ${timeoutMs}ms`)), timeoutMs)),
28
+ ]);
29
+ if (response.executionError)
30
+ break;
31
+ if (t < turns - 1)
32
+ input = response.text;
33
+ }
19
34
  duration = performance.now() - start;
20
35
  }
21
36
  catch (err) {
@@ -38,6 +53,7 @@ export async function executeScene(executor, scene) {
38
53
  }
39
54
  let passed = true;
40
55
  let error;
56
+ let judgement;
41
57
  for (const assertion of scene.assertions) {
42
58
  try {
43
59
  const value = extractField(response, assertion.field);
@@ -49,5 +65,31 @@ export async function executeScene(executor, scene) {
49
65
  break;
50
66
  }
51
67
  }
52
- return { prompt: scene.prompt, response, duration, passed, error };
68
+ const pending = collectPendingJudgements();
69
+ if (pending.length > 0 && passed) {
70
+ if (!judgeConfig) {
71
+ passed = false;
72
+ error = "judgedBy() requires a judge configured in agest.config.ts";
73
+ }
74
+ else {
75
+ const judgeExecutor = resolveJudgeExecutor(judgeConfig);
76
+ for (const p of pending) {
77
+ try {
78
+ const result = await callJudge(String(p.value), p.criteria, judgeExecutor);
79
+ judgement = result;
80
+ if (result.verdict === "fail" || result.verdict === "partial") {
81
+ passed = false;
82
+ error = `Judge verdict: ${result.verdict} — ${result.reasoning}`;
83
+ break;
84
+ }
85
+ }
86
+ catch (err) {
87
+ passed = false;
88
+ error = `Judge error: ${err.message}`;
89
+ break;
90
+ }
91
+ }
92
+ }
93
+ }
94
+ return { prompt: scene.prompt, response, duration, passed, error, judgement };
53
95
  }