@sebastiantuyu/agest 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +3 -1
- package/dist/adapters/remote.d.ts +1 -1
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +43 -0
- package/dist/context.d.ts +14 -1
- package/dist/context.js +95 -11
- package/dist/index.d.ts +7 -2
- package/dist/index.js +26 -1
- package/dist/preview.d.ts +2 -1
- package/dist/preview.js +671 -162
- package/dist/reporter.js +46 -0
- package/dist/reports.d.ts +20 -0
- package/dist/reports.js +99 -3
- package/dist/runner.js +69 -14
- package/dist/stats.d.ts +2 -1
- package/dist/stats.js +34 -5
- package/dist/types.d.ts +14 -0
- package/package.json +6 -3
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sebastian Tuyu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -116,11 +116,13 @@ npx tsx examples/openrouter.test.ts
|
|
|
116
116
|
- [x] Remote HTTP adapter for framework-agnostic testing
|
|
117
117
|
- [x] Report persistence to `.reports/` with YAML format
|
|
118
118
|
- [x] Stats CLI with multi-model comparison and dimension analysis
|
|
119
|
+
- [x] Lifecycle hooks: `beforeEach`, `beforeAll`, `afterEach`, `afterAll` supporting sync/async functions
|
|
120
|
+
- [x] Multiple test suites per agent via `suite()` to evaluate different aspects independently
|
|
121
|
+
- [x] Statistical runs: `.runs(n)` per scene with pass rate and Wilson significance scoring
|
|
119
122
|
|
|
120
123
|
### Up next
|
|
121
124
|
- [ ] Schema validation: `toBe.matchingSchema(zodSchema)`
|
|
122
125
|
- [ ] Semantic similarity: `toBe.semanticallySimilarTo(text, threshold)`
|
|
123
|
-
- [ ] Statistical runs: `.runs(n)` per scene with mean/stddev reporting
|
|
124
126
|
- [ ] Vercel AI SDK adapter
|
|
125
127
|
- [ ] Snapshot regression: diff current run against a saved baseline
|
|
126
128
|
|
package/dist/adapters/remote.d.ts
CHANGED
|
@@ -20,7 +20,7 @@ export interface RemoteAdapterOptions {
|
|
|
20
20
|
* When omitted the adapter tries common shapes:
|
|
21
21
|
* - `{ text }` / `{ response }` / `{ output }` / `{ message }` / plain string
|
|
22
22
|
*/
|
|
23
|
-
parseResponse?: (body:
|
|
23
|
+
parseResponse?: <TBody = unknown>(body: TBody) => AgentResponse;
|
|
24
24
|
/**
|
|
25
25
|
* Static metadata for this remote agent.
|
|
26
26
|
* Because the remote endpoint is opaque, metadata like model name,
|
package/dist/cli.d.ts
ADDED
package/dist/cli.js
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { spawn } from "child_process";
|
|
3
|
+
import { main as stats } from "./stats.js";
|
|
4
|
+
import { main as preview } from "./preview.js";
|
|
5
|
+
const command = process.argv[2];
|
|
6
|
+
async function run() {
|
|
7
|
+
const files = process.argv.slice(3);
|
|
8
|
+
if (files.length === 0) {
|
|
9
|
+
console.error(" Usage: agest run <file...>");
|
|
10
|
+
process.exit(1);
|
|
11
|
+
}
|
|
12
|
+
for (const file of files) {
|
|
13
|
+
const child = spawn("npx", ["tsx", file], {
|
|
14
|
+
stdio: "inherit",
|
|
15
|
+
shell: true,
|
|
16
|
+
});
|
|
17
|
+
const code = await new Promise((resolve) => child.on("close", (c) => resolve(c ?? 1)));
|
|
18
|
+
if (code !== 0)
|
|
19
|
+
process.exit(code);
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
const commands = {
|
|
23
|
+
stats,
|
|
24
|
+
preview,
|
|
25
|
+
run,
|
|
26
|
+
};
|
|
27
|
+
if (!command || !commands[command]) {
|
|
28
|
+
console.log(`
|
|
29
|
+
Usage: agest <command>
|
|
30
|
+
|
|
31
|
+
Commands:
|
|
32
|
+
run Run test file(s) agest run tests/*.test.ts
|
|
33
|
+
stats Show aggregated test statistics
|
|
34
|
+
preview Generate an HTML report preview
|
|
35
|
+
`);
|
|
36
|
+
process.exit(command ? 1 : 0);
|
|
37
|
+
}
|
|
38
|
+
// Forward remaining args so subcommands see them at process.argv[2+]
|
|
39
|
+
process.argv = [process.argv[0], process.argv[1], ...process.argv.slice(3)];
|
|
40
|
+
commands[command]().catch((err) => {
|
|
41
|
+
console.error("Error:", err.message);
|
|
42
|
+
process.exit(1);
|
|
43
|
+
});
|
package/dist/context.d.ts
CHANGED
|
@@ -1,12 +1,17 @@
|
|
|
1
|
-
import type { AgentExecutor, AgentReport, SceneDefinition } from "./types";
|
|
1
|
+
import type { AgentExecutor, AgentReport, HookFn, SceneDefinition } from "./types";
|
|
2
2
|
export declare class SceneBuilder {
|
|
3
3
|
private _prompt;
|
|
4
4
|
private _assertions;
|
|
5
5
|
private _timeout?;
|
|
6
6
|
private _turns?;
|
|
7
|
+
private _runs?;
|
|
8
|
+
private _suite?;
|
|
7
9
|
constructor(_prompt: string);
|
|
8
10
|
timeout(ms: number): SceneBuilder;
|
|
9
11
|
turns(n: number): SceneBuilder;
|
|
12
|
+
runs(n: number): SceneBuilder;
|
|
13
|
+
/** @internal */
|
|
14
|
+
_setSuite(name: string): void;
|
|
10
15
|
expect(field: string, fn: (value: any) => void): SceneBuilder;
|
|
11
16
|
toDefinition(): SceneDefinition;
|
|
12
17
|
}
|
|
@@ -14,7 +19,15 @@ export declare class AgentContext {
|
|
|
14
19
|
private _executor;
|
|
15
20
|
private _name?;
|
|
16
21
|
private _scenes;
|
|
22
|
+
private _currentSuite?;
|
|
23
|
+
private _beforeAllHooks;
|
|
24
|
+
private _afterAllHooks;
|
|
25
|
+
private _beforeEachHooks;
|
|
26
|
+
private _afterEachHooks;
|
|
17
27
|
constructor(_executor: AgentExecutor, _name?: string | undefined);
|
|
28
|
+
registerHook(type: "beforeAll" | "afterAll" | "beforeEach" | "afterEach", fn: HookFn): void;
|
|
29
|
+
setSuite(name: string): void;
|
|
30
|
+
clearSuite(): void;
|
|
18
31
|
registerScene(prompt: string): SceneBuilder;
|
|
19
32
|
execute(): Promise<AgentReport>;
|
|
20
33
|
}
|
package/dist/context.js
CHANGED
|
@@ -9,6 +9,8 @@ export class SceneBuilder {
|
|
|
9
9
|
_assertions = [];
|
|
10
10
|
_timeout;
|
|
11
11
|
_turns;
|
|
12
|
+
_runs;
|
|
13
|
+
_suite;
|
|
12
14
|
constructor(_prompt) {
|
|
13
15
|
this._prompt = _prompt;
|
|
14
16
|
}
|
|
@@ -20,24 +22,56 @@ export class SceneBuilder {
|
|
|
20
22
|
this._turns = n;
|
|
21
23
|
return this;
|
|
22
24
|
}
|
|
25
|
+
runs(n) {
|
|
26
|
+
this._runs = Math.max(1, Math.round(n));
|
|
27
|
+
return this;
|
|
28
|
+
}
|
|
29
|
+
/** @internal */
|
|
30
|
+
_setSuite(name) {
|
|
31
|
+
this._suite = name;
|
|
32
|
+
}
|
|
23
33
|
expect(field, fn) {
|
|
24
34
|
this._assertions.push({ field, fn });
|
|
25
35
|
return this;
|
|
26
36
|
}
|
|
27
37
|
toDefinition() {
|
|
28
|
-
return {
|
|
38
|
+
return {
|
|
39
|
+
prompt: this._prompt,
|
|
40
|
+
assertions: [...this._assertions],
|
|
41
|
+
timeout: this._timeout,
|
|
42
|
+
turns: this._turns,
|
|
43
|
+
runs: this._runs,
|
|
44
|
+
suite: this._suite,
|
|
45
|
+
};
|
|
29
46
|
}
|
|
30
47
|
}
|
|
31
48
|
export class AgentContext {
|
|
32
49
|
_executor;
|
|
33
50
|
_name;
|
|
34
51
|
_scenes = [];
|
|
52
|
+
_currentSuite;
|
|
53
|
+
_beforeAllHooks = [];
|
|
54
|
+
_afterAllHooks = [];
|
|
55
|
+
_beforeEachHooks = [];
|
|
56
|
+
_afterEachHooks = [];
|
|
35
57
|
constructor(_executor, _name) {
|
|
36
58
|
this._executor = _executor;
|
|
37
59
|
this._name = _name;
|
|
38
60
|
}
|
|
61
|
+
registerHook(type, fn) {
|
|
62
|
+
this[`_${type}Hooks`].push(fn);
|
|
63
|
+
}
|
|
64
|
+
setSuite(name) {
|
|
65
|
+
this._currentSuite = name;
|
|
66
|
+
}
|
|
67
|
+
clearSuite() {
|
|
68
|
+
this._currentSuite = undefined;
|
|
69
|
+
}
|
|
39
70
|
registerScene(prompt) {
|
|
40
71
|
const builder = new SceneBuilder(prompt);
|
|
72
|
+
if (this._currentSuite) {
|
|
73
|
+
builder._setSuite(this._currentSuite);
|
|
74
|
+
}
|
|
41
75
|
this._scenes.push(builder);
|
|
42
76
|
return builder;
|
|
43
77
|
}
|
|
@@ -47,32 +81,82 @@ export class AgentContext {
|
|
|
47
81
|
const definitions = this._scenes.map((s) => s.toDefinition());
|
|
48
82
|
const orderedResults = new Array(definitions.length);
|
|
49
83
|
const total = definitions.length;
|
|
50
|
-
|
|
51
|
-
const
|
|
84
|
+
// Group scenes by suite for organized output
|
|
85
|
+
const suiteNames = [...new Set(definitions.map((d) => d.suite).filter(Boolean))];
|
|
86
|
+
const hasSuites = suiteNames.length > 0;
|
|
87
|
+
const suiteCount = hasSuites ? ` (${suiteNames.length} suite${suiteNames.length !== 1 ? "s" : ""})` : "";
|
|
88
|
+
logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}${suiteCount}${parallelism > 1 ? c.dim(` (parallelism: ${parallelism})`) : ""}...\n`));
|
|
89
|
+
// Run beforeAll hooks
|
|
90
|
+
for (const hook of this._beforeAllHooks) {
|
|
91
|
+
await hook();
|
|
92
|
+
}
|
|
93
|
+
const buildTask = (scene, i) => async () => {
|
|
52
94
|
const label = scene.prompt.length > 60
|
|
53
95
|
? scene.prompt.slice(0, 57) + "..."
|
|
54
96
|
: scene.prompt;
|
|
97
|
+
// Run beforeEach hooks
|
|
98
|
+
for (const hook of this._beforeEachHooks) {
|
|
99
|
+
await hook();
|
|
100
|
+
}
|
|
55
101
|
const result = await executeScene(this._executor, scene, config.timeout, config.judge, config.turns);
|
|
56
102
|
orderedResults[i] = result;
|
|
103
|
+
// Run afterEach hooks
|
|
104
|
+
for (const hook of this._afterEachHooks) {
|
|
105
|
+
await hook();
|
|
106
|
+
}
|
|
57
107
|
const ms = result.duration.toFixed(0);
|
|
108
|
+
const runsLabel = result.runs ? c.dim(` [${result.runs.filter(r => r.passed).length}/${result.runs.length} passed]`) : "";
|
|
109
|
+
const indent = hasSuites ? " " : " ";
|
|
58
110
|
if (result.passed) {
|
|
59
|
-
logger.info(
|
|
111
|
+
logger.info(`${indent}${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.green("PASS")}${c.dim(` (${ms}ms)`)}${runsLabel}`);
|
|
60
112
|
}
|
|
61
113
|
else if (result.judgement?.verdict === "partial") {
|
|
62
|
-
logger.info(
|
|
114
|
+
logger.info(`${indent}${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.yellow("PARTIAL")}${c.dim(` (${ms}ms)`)}${runsLabel}`);
|
|
63
115
|
if (result.error) {
|
|
64
|
-
logger.info(
|
|
116
|
+
logger.info(`${indent} ${c.yellow(result.error)}`);
|
|
65
117
|
}
|
|
66
118
|
}
|
|
67
119
|
else {
|
|
68
|
-
logger.info(
|
|
120
|
+
logger.info(`${indent}${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.red("FAIL")}${c.dim(` (${ms}ms)`)}${runsLabel}`);
|
|
69
121
|
if (result.error) {
|
|
70
|
-
logger.info(
|
|
122
|
+
logger.info(`${indent} ${c.red(result.error)}`);
|
|
71
123
|
}
|
|
72
124
|
}
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
125
|
+
if (result.statisticalSignificance != null) {
|
|
126
|
+
const sig = result.statisticalSignificance;
|
|
127
|
+
const sigColor = sig >= 0.95 ? c.green : sig >= 0.80 ? c.yellow : c.red;
|
|
128
|
+
logger.info(`${indent} ${c.dim("significance:")} ${sigColor(`${(sig * 100).toFixed(1)}%`)} ${c.dim(`(pass rate: ${((result.passRate ?? 0) * 100).toFixed(1)}%)`)}`);
|
|
129
|
+
}
|
|
130
|
+
logger.debug(`${indent} response: ${result.response.text?.slice(0, 120)}`);
|
|
131
|
+
};
|
|
132
|
+
if (hasSuites) {
|
|
133
|
+
// Execute suite by suite — print header once, then run all scenes in that suite
|
|
134
|
+
for (const suiteName of suiteNames) {
|
|
135
|
+
const suiteIndices = definitions
|
|
136
|
+
.map((d, i) => d.suite === suiteName ? i : -1)
|
|
137
|
+
.filter((i) => i !== -1);
|
|
138
|
+
logger.info(` ${c.bold(c.cyan(`▸ ${suiteName}`))} ${c.dim(`(${suiteIndices.length} scene${suiteIndices.length !== 1 ? "s" : ""})`)}`);
|
|
139
|
+
const tasks = suiteIndices.map((i) => buildTask(definitions[i], i));
|
|
140
|
+
await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
|
|
141
|
+
logger.info("");
|
|
142
|
+
}
|
|
143
|
+
// Run any scenes not in a suite
|
|
144
|
+
const unsuitedIndices = definitions
|
|
145
|
+
.map((d, i) => d.suite ? -1 : i)
|
|
146
|
+
.filter((i) => i !== -1);
|
|
147
|
+
if (unsuitedIndices.length > 0) {
|
|
148
|
+
const tasks = unsuitedIndices.map((i) => buildTask(definitions[i], i));
|
|
149
|
+
await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
else {
|
|
153
|
+
const tasks = definitions.map((scene, i) => buildTask(scene, i));
|
|
154
|
+
await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
|
|
155
|
+
}
|
|
156
|
+
// Run afterAll hooks
|
|
157
|
+
for (const hook of this._afterAllHooks) {
|
|
158
|
+
await hook();
|
|
159
|
+
}
|
|
76
160
|
const results = orderedResults;
|
|
77
161
|
let totalDuration = results.reduce((sum, r) => sum + r.duration, 0);
|
|
78
162
|
logger.info("");
|
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { AgentExecutor, AgentReport } from "./types";
|
|
1
|
+
import type { AgentExecutor, AgentReport, HookFn } from "./types";
|
|
2
2
|
import { SceneBuilder } from "./context";
|
|
3
3
|
export { expect } from "./assertions";
|
|
4
4
|
export { logger } from "./logger";
|
|
@@ -7,11 +7,16 @@ export type { AgestConfig, JudgeConfig, JudgeExecutor } from "./config";
|
|
|
7
7
|
export type { LogLevel } from "./logger";
|
|
8
8
|
export type { AgentExpectation, AgentMatchers } from "./assertions";
|
|
9
9
|
export type { JudgeCriteria } from "./judge";
|
|
10
|
-
export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, JudgeVerdict, JudgeResult, } from "./types";
|
|
10
|
+
export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, RunResult, JudgeVerdict, JudgeResult, HookFn, } from "./types";
|
|
11
11
|
export interface AgentOptions {
|
|
12
12
|
name?: string;
|
|
13
13
|
}
|
|
14
14
|
export declare function scene(prompt: string): SceneBuilder;
|
|
15
|
+
export declare function beforeAll(fn: HookFn): void;
|
|
16
|
+
export declare function afterAll(fn: HookFn): void;
|
|
17
|
+
export declare function beforeEach(fn: HookFn): void;
|
|
18
|
+
export declare function afterEach(fn: HookFn): void;
|
|
19
|
+
export declare function suite(name: string, fn: () => void): void;
|
|
15
20
|
/** @internal reset auto-run state between tests */
|
|
16
21
|
export declare function _resetAutoRun(): void;
|
|
17
22
|
export declare function agent(executor: AgentExecutor, fn: () => void, options?: AgentOptions): Promise<AgentReport>;
|
package/dist/index.js
CHANGED
|
@@ -5,12 +5,36 @@ export { defineConfig } from "./config";
|
|
|
5
5
|
export function scene(prompt) {
|
|
6
6
|
return getContext().registerScene(prompt);
|
|
7
7
|
}
|
|
8
|
+
export function beforeAll(fn) {
|
|
9
|
+
getContext().registerHook("beforeAll", fn);
|
|
10
|
+
}
|
|
11
|
+
export function afterAll(fn) {
|
|
12
|
+
getContext().registerHook("afterAll", fn);
|
|
13
|
+
}
|
|
14
|
+
export function beforeEach(fn) {
|
|
15
|
+
getContext().registerHook("beforeEach", fn);
|
|
16
|
+
}
|
|
17
|
+
export function afterEach(fn) {
|
|
18
|
+
getContext().registerHook("afterEach", fn);
|
|
19
|
+
}
|
|
20
|
+
export function suite(name, fn) {
|
|
21
|
+
const ctx = getContext();
|
|
22
|
+
ctx.setSuite(name);
|
|
23
|
+
try {
|
|
24
|
+
fn();
|
|
25
|
+
}
|
|
26
|
+
finally {
|
|
27
|
+
ctx.clearSuite();
|
|
28
|
+
}
|
|
29
|
+
}
|
|
8
30
|
const pendingAgents = [];
|
|
9
31
|
let autoRunScheduled = false;
|
|
32
|
+
let executionChain = Promise.resolve();
|
|
10
33
|
/** @internal reset auto-run state between tests */
|
|
11
34
|
export function _resetAutoRun() {
|
|
12
35
|
pendingAgents.length = 0;
|
|
13
36
|
autoRunScheduled = false;
|
|
37
|
+
executionChain = Promise.resolve();
|
|
14
38
|
}
|
|
15
39
|
export function agent(executor, fn, options) {
|
|
16
40
|
const ctx = new AgentContext(executor, options?.name);
|
|
@@ -23,7 +47,8 @@ export function agent(executor, fn, options) {
|
|
|
23
47
|
return Promise.reject(err);
|
|
24
48
|
}
|
|
25
49
|
setContext(null);
|
|
26
|
-
const promise = ctx.execute();
|
|
50
|
+
const promise = executionChain.then(() => ctx.execute());
|
|
51
|
+
executionChain = promise.then(() => { }, () => { });
|
|
27
52
|
pendingAgents.push(promise);
|
|
28
53
|
if (!autoRunScheduled) {
|
|
29
54
|
autoRunScheduled = true;
|
package/dist/preview.d.ts
CHANGED
|
@@ -1 +1,2 @@
|
|
|
1
|
-
|
|
1
|
+
declare function main(): Promise<void>;
|
|
2
|
+
export { main };
|