@sebastiantuyu/agest 0.3.0 → 0.3.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Sebastian Tuyu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -116,11 +116,13 @@ npx tsx examples/openrouter.test.ts
116
116
  - [x] Remote HTTP adapter for framework-agnostic testing
117
117
  - [x] Report persistence to `.reports/` with YAML format
118
118
  - [x] Stats CLI with multi-model comparison and dimension analysis
119
+ - [x] Lifecycle hooks: `beforeEach`, `beforeAll`, `afterEach`, `afterAll` supporting sync/async functions
120
+ - [x] Multiple test suites per agent via `suite()` to evaluate different aspects independently
121
+ - [x] Statistical runs: `.runs(n)` per scene with pass rate and Wilson significance scoring
119
122
 
120
123
  ### Up next
121
124
  - [ ] Schema validation: `toBe.matchingSchema(zodSchema)`
122
125
  - [ ] Semantic similarity: `toBe.semanticallySimilarTo(text, threshold)`
123
- - [ ] Statistical runs: `.runs(n)` per scene with mean/stddev reporting
124
126
  - [ ] Vercel AI SDK adapter
125
127
  - [ ] Snapshot regression: diff current run against a saved baseline
126
128
 
package/dist/context.d.ts CHANGED
@@ -1,12 +1,17 @@
1
- import type { AgentExecutor, AgentReport, SceneDefinition } from "./types";
1
+ import type { AgentExecutor, AgentReport, HookFn, SceneDefinition } from "./types";
2
2
  export declare class SceneBuilder {
3
3
  private _prompt;
4
4
  private _assertions;
5
5
  private _timeout?;
6
6
  private _turns?;
7
+ private _runs?;
8
+ private _suite?;
7
9
  constructor(_prompt: string);
8
10
  timeout(ms: number): SceneBuilder;
9
11
  turns(n: number): SceneBuilder;
12
+ runs(n: number): SceneBuilder;
13
+ /** @internal */
14
+ _setSuite(name: string): void;
10
15
  expect(field: string, fn: (value: any) => void): SceneBuilder;
11
16
  toDefinition(): SceneDefinition;
12
17
  }
@@ -14,7 +19,15 @@ export declare class AgentContext {
14
19
  private _executor;
15
20
  private _name?;
16
21
  private _scenes;
22
+ private _currentSuite?;
23
+ private _beforeAllHooks;
24
+ private _afterAllHooks;
25
+ private _beforeEachHooks;
26
+ private _afterEachHooks;
17
27
  constructor(_executor: AgentExecutor, _name?: string | undefined);
28
+ registerHook(type: "beforeAll" | "afterAll" | "beforeEach" | "afterEach", fn: HookFn): void;
29
+ setSuite(name: string): void;
30
+ clearSuite(): void;
18
31
  registerScene(prompt: string): SceneBuilder;
19
32
  execute(): Promise<AgentReport>;
20
33
  }
package/dist/context.js CHANGED
@@ -9,6 +9,8 @@ export class SceneBuilder {
9
9
  _assertions = [];
10
10
  _timeout;
11
11
  _turns;
12
+ _runs;
13
+ _suite;
12
14
  constructor(_prompt) {
13
15
  this._prompt = _prompt;
14
16
  }
@@ -20,24 +22,56 @@ export class SceneBuilder {
20
22
  this._turns = n;
21
23
  return this;
22
24
  }
25
+ runs(n) {
26
+ this._runs = Math.max(1, Math.round(n));
27
+ return this;
28
+ }
29
+ /** @internal */
30
+ _setSuite(name) {
31
+ this._suite = name;
32
+ }
23
33
  expect(field, fn) {
24
34
  this._assertions.push({ field, fn });
25
35
  return this;
26
36
  }
27
37
  toDefinition() {
28
- return { prompt: this._prompt, assertions: [...this._assertions], timeout: this._timeout, turns: this._turns };
38
+ return {
39
+ prompt: this._prompt,
40
+ assertions: [...this._assertions],
41
+ timeout: this._timeout,
42
+ turns: this._turns,
43
+ runs: this._runs,
44
+ suite: this._suite,
45
+ };
29
46
  }
30
47
  }
31
48
  export class AgentContext {
32
49
  _executor;
33
50
  _name;
34
51
  _scenes = [];
52
+ _currentSuite;
53
+ _beforeAllHooks = [];
54
+ _afterAllHooks = [];
55
+ _beforeEachHooks = [];
56
+ _afterEachHooks = [];
35
57
  constructor(_executor, _name) {
36
58
  this._executor = _executor;
37
59
  this._name = _name;
38
60
  }
61
+ registerHook(type, fn) {
62
+ this[`_${type}Hooks`].push(fn);
63
+ }
64
+ setSuite(name) {
65
+ this._currentSuite = name;
66
+ }
67
+ clearSuite() {
68
+ this._currentSuite = undefined;
69
+ }
39
70
  registerScene(prompt) {
40
71
  const builder = new SceneBuilder(prompt);
72
+ if (this._currentSuite) {
73
+ builder._setSuite(this._currentSuite);
74
+ }
41
75
  this._scenes.push(builder);
42
76
  return builder;
43
77
  }
@@ -47,32 +81,82 @@ export class AgentContext {
47
81
  const definitions = this._scenes.map((s) => s.toDefinition());
48
82
  const orderedResults = new Array(definitions.length);
49
83
  const total = definitions.length;
50
- logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}${parallelism > 1 ? c.dim(` (parallelism: ${parallelism})`) : ""}...\n`));
51
- const tasks = definitions.map((scene, i) => async () => {
84
+ // Group scenes by suite for organized output
85
+ const suiteNames = [...new Set(definitions.map((d) => d.suite).filter(Boolean))];
86
+ const hasSuites = suiteNames.length > 0;
87
+ const suiteCount = hasSuites ? ` (${suiteNames.length} suite${suiteNames.length !== 1 ? "s" : ""})` : "";
88
+ logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}${suiteCount}${parallelism > 1 ? c.dim(` (parallelism: ${parallelism})`) : ""}...\n`));
89
+ // Run beforeAll hooks
90
+ for (const hook of this._beforeAllHooks) {
91
+ await hook();
92
+ }
93
+ const buildTask = (scene, i) => async () => {
52
94
  const label = scene.prompt.length > 60
53
95
  ? scene.prompt.slice(0, 57) + "..."
54
96
  : scene.prompt;
97
+ // Run beforeEach hooks
98
+ for (const hook of this._beforeEachHooks) {
99
+ await hook();
100
+ }
55
101
  const result = await executeScene(this._executor, scene, config.timeout, config.judge, config.turns);
56
102
  orderedResults[i] = result;
103
+ // Run afterEach hooks
104
+ for (const hook of this._afterEachHooks) {
105
+ await hook();
106
+ }
57
107
  const ms = result.duration.toFixed(0);
108
+ const runsLabel = result.runs ? c.dim(` [${result.runs.filter(r => r.passed).length}/${result.runs.length} passed]`) : "";
109
+ const indent = hasSuites ? " " : " ";
58
110
  if (result.passed) {
59
- logger.info(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.green("PASS")}${c.dim(` (${ms}ms)`)}`);
111
+ logger.info(`${indent}${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.green("PASS")}${c.dim(` (${ms}ms)`)}${runsLabel}`);
60
112
  }
61
113
  else if (result.judgement?.verdict === "partial") {
62
- logger.info(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.yellow("PARTIAL")}${c.dim(` (${ms}ms)`)}`);
114
+ logger.info(`${indent}${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.yellow("PARTIAL")}${c.dim(` (${ms}ms)`)}${runsLabel}`);
63
115
  if (result.error) {
64
- logger.info(` ${c.yellow(result.error)}`);
116
+ logger.info(`${indent} ${c.yellow(result.error)}`);
65
117
  }
66
118
  }
67
119
  else {
68
- logger.info(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.red("FAIL")}${c.dim(` (${ms}ms)`)}`);
120
+ logger.info(`${indent}${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.red("FAIL")}${c.dim(` (${ms}ms)`)}${runsLabel}`);
69
121
  if (result.error) {
70
- logger.info(` ${c.red(result.error)}`);
122
+ logger.info(`${indent} ${c.red(result.error)}`);
71
123
  }
72
124
  }
73
- logger.debug(` response: ${result.response.text?.slice(0, 120)}`);
74
- });
75
- await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
125
+ if (result.statisticalSignificance != null) {
126
+ const sig = result.statisticalSignificance;
127
+ const sigColor = sig >= 0.95 ? c.green : sig >= 0.80 ? c.yellow : c.red;
128
+ logger.info(`${indent} ${c.dim("significance:")} ${sigColor(`${(sig * 100).toFixed(1)}%`)} ${c.dim(`(pass rate: ${((result.passRate ?? 0) * 100).toFixed(1)}%)`)}`);
129
+ }
130
+ logger.debug(`${indent} response: ${result.response.text?.slice(0, 120)}`);
131
+ };
132
+ if (hasSuites) {
133
+ // Execute suite by suite — print header once, then run all scenes in that suite
134
+ for (const suiteName of suiteNames) {
135
+ const suiteIndices = definitions
136
+ .map((d, i) => d.suite === suiteName ? i : -1)
137
+ .filter((i) => i !== -1);
138
+ logger.info(` ${c.bold(c.cyan(`▸ ${suiteName}`))} ${c.dim(`(${suiteIndices.length} scene${suiteIndices.length !== 1 ? "s" : ""})`)}`);
139
+ const tasks = suiteIndices.map((i) => buildTask(definitions[i], i));
140
+ await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
141
+ logger.info("");
142
+ }
143
+ // Run any scenes not in a suite
144
+ const unsuitedIndices = definitions
145
+ .map((d, i) => d.suite ? -1 : i)
146
+ .filter((i) => i !== -1);
147
+ if (unsuitedIndices.length > 0) {
148
+ const tasks = unsuitedIndices.map((i) => buildTask(definitions[i], i));
149
+ await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
150
+ }
151
+ }
152
+ else {
153
+ const tasks = definitions.map((scene, i) => buildTask(scene, i));
154
+ await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
155
+ }
156
+ // Run afterAll hooks
157
+ for (const hook of this._afterAllHooks) {
158
+ await hook();
159
+ }
76
160
  const results = orderedResults;
77
161
  let totalDuration = results.reduce((sum, r) => sum + r.duration, 0);
78
162
  logger.info("");
package/dist/index.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import type { AgentExecutor, AgentReport } from "./types";
1
+ import type { AgentExecutor, AgentReport, HookFn } from "./types";
2
2
  import { SceneBuilder } from "./context";
3
3
  export { expect } from "./assertions";
4
4
  export { logger } from "./logger";
@@ -7,11 +7,16 @@ export type { AgestConfig, JudgeConfig, JudgeExecutor } from "./config";
7
7
  export type { LogLevel } from "./logger";
8
8
  export type { AgentExpectation, AgentMatchers } from "./assertions";
9
9
  export type { JudgeCriteria } from "./judge";
10
- export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, JudgeVerdict, JudgeResult, } from "./types";
10
+ export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, RunResult, JudgeVerdict, JudgeResult, HookFn, } from "./types";
11
11
  export interface AgentOptions {
12
12
  name?: string;
13
13
  }
14
14
  export declare function scene(prompt: string): SceneBuilder;
15
+ export declare function beforeAll(fn: HookFn): void;
16
+ export declare function afterAll(fn: HookFn): void;
17
+ export declare function beforeEach(fn: HookFn): void;
18
+ export declare function afterEach(fn: HookFn): void;
19
+ export declare function suite(name: string, fn: () => void): void;
15
20
  /** @internal reset auto-run state between tests */
16
21
  export declare function _resetAutoRun(): void;
17
22
  export declare function agent(executor: AgentExecutor, fn: () => void, options?: AgentOptions): Promise<AgentReport>;
package/dist/index.js CHANGED
@@ -5,12 +5,36 @@ export { defineConfig } from "./config";
5
5
  export function scene(prompt) {
6
6
  return getContext().registerScene(prompt);
7
7
  }
8
+ export function beforeAll(fn) {
9
+ getContext().registerHook("beforeAll", fn);
10
+ }
11
+ export function afterAll(fn) {
12
+ getContext().registerHook("afterAll", fn);
13
+ }
14
+ export function beforeEach(fn) {
15
+ getContext().registerHook("beforeEach", fn);
16
+ }
17
+ export function afterEach(fn) {
18
+ getContext().registerHook("afterEach", fn);
19
+ }
20
+ export function suite(name, fn) {
21
+ const ctx = getContext();
22
+ ctx.setSuite(name);
23
+ try {
24
+ fn();
25
+ }
26
+ finally {
27
+ ctx.clearSuite();
28
+ }
29
+ }
8
30
  const pendingAgents = [];
9
31
  let autoRunScheduled = false;
32
+ let executionChain = Promise.resolve();
10
33
  /** @internal reset auto-run state between tests */
11
34
  export function _resetAutoRun() {
12
35
  pendingAgents.length = 0;
13
36
  autoRunScheduled = false;
37
+ executionChain = Promise.resolve();
14
38
  }
15
39
  export function agent(executor, fn, options) {
16
40
  const ctx = new AgentContext(executor, options?.name);
@@ -23,7 +47,8 @@ export function agent(executor, fn, options) {
23
47
  return Promise.reject(err);
24
48
  }
25
49
  setContext(null);
26
- const promise = ctx.execute();
50
+ const promise = executionChain.then(() => ctx.execute());
51
+ executionChain = promise.then(() => { }, () => { });
27
52
  pendingAgents.push(promise);
28
53
  if (!autoRunScheduled) {
29
54
  autoRunScheduled = true;